eval-protocol 0.2.44__tar.gz → 0.2.45.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.44/eval_protocol.egg-info → eval_protocol-0.2.45.dev0}/PKG-INFO +1 -1
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/__init__.py +8 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/__init__.py +7 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/fireworks_tracing.py +32 -51
- eval_protocol-0.2.45.dev0/eval_protocol/adapters/weave.py +130 -0
- eval_protocol-0.2.45.dev0/eval_protocol/log_utils/util.py +22 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/__init__.py +10 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/app.py +259 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/auth.py +12 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/langfuse.py +358 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/litellm.py +168 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/main.py +10 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/models.py +51 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +48 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol.egg-info/SOURCES.txt +10 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/LICENSE +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/README.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/development/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/pyproject.toml +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/setup.cfg +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/setup.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_format.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_length.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_math.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_models.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/versioneer.py +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.44
|
|
3
|
+
Version: 0.2.45.dev0
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,6 +34,7 @@ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutPr
|
|
|
34
34
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
35
35
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
36
36
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
37
|
+
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
37
38
|
|
|
38
39
|
from .types.remote_rollout_processor import (
|
|
39
40
|
InitRequest,
|
|
@@ -63,11 +64,18 @@ try:
|
|
|
63
64
|
except ImportError:
|
|
64
65
|
LangSmithAdapter = None
|
|
65
66
|
|
|
67
|
+
|
|
68
|
+
try:
|
|
69
|
+
from .adapters import WeaveAdapter
|
|
70
|
+
except ImportError:
|
|
71
|
+
WeaveAdapter = None
|
|
72
|
+
|
|
66
73
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
67
74
|
|
|
68
75
|
__all__ = [
|
|
69
76
|
"ElasticsearchDirectHttpHandler",
|
|
70
77
|
"RolloutIdFilter",
|
|
78
|
+
"setup_rollout_logging_for_elasticsearch_handler",
|
|
71
79
|
"DataLoaderConfig",
|
|
72
80
|
"Status",
|
|
73
81
|
"RemoteRolloutProcessor",
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-09T01:23:30-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.44"
|
|
14
|
+
"full-revisionid": "c2ec0c8bb3f927b3c7f77c8a0e4fb955c7685ea6",
|
|
15
|
+
"version": "0.2.45-dev"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.44 → eval_protocol-0.2.45.dev0}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
|
-
import time
|
|
11
10
|
from datetime import datetime
|
|
12
11
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
+
import os
|
|
13
13
|
|
|
14
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
15
15
|
from .base import BaseAdapter
|
|
@@ -281,9 +281,8 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
281
281
|
from_timestamp: Optional[datetime] = None,
|
|
282
282
|
to_timestamp: Optional[datetime] = None,
|
|
283
283
|
include_tool_calls: bool = True,
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
proxy_max_retries: int = 3,
|
|
284
|
+
sleep_between_gets: float = 0.1,
|
|
285
|
+
max_retries: int = 3,
|
|
287
286
|
span_name: Optional[str] = None,
|
|
288
287
|
converter: Optional[TraceDictConverter] = None,
|
|
289
288
|
) -> List[EvaluationRow]:
|
|
@@ -305,10 +304,8 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
305
304
|
from_timestamp: Explicit start time (ISO format)
|
|
306
305
|
to_timestamp: Explicit end time (ISO format)
|
|
307
306
|
include_tool_calls: Whether to include tool calling traces
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
proxy_max_retries: Maximum retries when proxy returns 404 (client-side retries with exponential backoff)
|
|
311
|
-
span_name: If provided, extract messages from generations within this named span
|
|
307
|
+
sleep_between_gets: Sleep time between polling attempts (default: 0.1s)
|
|
308
|
+
max_retries: Max retry attempts used by proxy (default: 3)
|
|
312
309
|
converter: Optional custom converter implementing TraceDictConverter protocol.
|
|
313
310
|
If provided, this will be used instead of the default conversion logic.
|
|
314
311
|
|
|
@@ -318,9 +315,9 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
318
315
|
Raises:
|
|
319
316
|
ValueError: If tags list is empty
|
|
320
317
|
"""
|
|
321
|
-
# Validate that tags are provided
|
|
318
|
+
# Validate that tags are provided
|
|
322
319
|
if not tags or len(tags) == 0:
|
|
323
|
-
raise ValueError("At least one tag is required to fetch traces
|
|
320
|
+
raise ValueError("At least one tag is required to fetch traces")
|
|
324
321
|
|
|
325
322
|
eval_rows = []
|
|
326
323
|
|
|
@@ -339,58 +336,42 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
339
336
|
"hours_back": hours_back,
|
|
340
337
|
"from_timestamp": from_timestamp.isoformat() if from_timestamp else None,
|
|
341
338
|
"to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
|
|
342
|
-
"sleep_between_gets":
|
|
343
|
-
"max_retries":
|
|
339
|
+
"sleep_between_gets": sleep_between_gets,
|
|
340
|
+
"max_retries": max_retries,
|
|
344
341
|
}
|
|
345
342
|
|
|
346
343
|
# Remove None values
|
|
347
344
|
params = {k: v for k, v in params.items() if v is not None}
|
|
348
345
|
|
|
349
|
-
# Make request to proxy
|
|
346
|
+
# Make request to proxy
|
|
350
347
|
if self.project_id:
|
|
351
348
|
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
|
|
352
349
|
else:
|
|
353
350
|
url = f"{self.base_url}/v1/traces"
|
|
354
351
|
|
|
355
|
-
|
|
352
|
+
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
353
|
+
|
|
356
354
|
result = None
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
error_msg = e.response.text
|
|
378
|
-
|
|
379
|
-
if should_retry and attempt < proxy_max_retries - 1:
|
|
380
|
-
sleep_time = 2 ** (attempt + 1)
|
|
381
|
-
logger.warning(error_msg)
|
|
382
|
-
time.sleep(sleep_time)
|
|
383
|
-
else:
|
|
384
|
-
# Final retry or non-retryable error
|
|
385
|
-
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
386
|
-
return eval_rows
|
|
387
|
-
except requests.exceptions.RequestException as e:
|
|
388
|
-
# Non-HTTP errors (network issues, timeouts, etc.)
|
|
389
|
-
logger.error("Failed to fetch traces from proxy: %s", str(e))
|
|
390
|
-
return eval_rows
|
|
391
|
-
|
|
392
|
-
if result is None:
|
|
393
|
-
logger.error("Failed to fetch traces after %d retries", proxy_max_retries)
|
|
355
|
+
try:
|
|
356
|
+
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
357
|
+
response.raise_for_status()
|
|
358
|
+
result = response.json()
|
|
359
|
+
except requests.exceptions.HTTPError as e:
|
|
360
|
+
error_msg = str(e)
|
|
361
|
+
|
|
362
|
+
# Try to extract detail message from response
|
|
363
|
+
if e.response is not None:
|
|
364
|
+
try:
|
|
365
|
+
error_detail = e.response.json().get("detail", {})
|
|
366
|
+
error_msg = error_detail or e.response.text
|
|
367
|
+
except Exception: # In case e.response.json() fails
|
|
368
|
+
error_msg = f"Proxy error: {e.response.text}"
|
|
369
|
+
|
|
370
|
+
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
371
|
+
return eval_rows
|
|
372
|
+
except requests.exceptions.RequestException as e:
|
|
373
|
+
# Non-HTTP errors (network issues, timeouts, etc.)
|
|
374
|
+
logger.error("Failed to fetch traces from proxy: %s", str(e))
|
|
394
375
|
return eval_rows
|
|
395
376
|
|
|
396
377
|
# Extract traces from response
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Weave (Weights & Biases) adapter for Eval Protocol.
|
|
2
|
+
|
|
3
|
+
This adapter fetches recent root traces from Weave Trace API and converts them
|
|
4
|
+
to `EvaluationRow` format for use in evaluation pipelines. It is intentionally
|
|
5
|
+
minimal and depends only on requests.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
import os
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
from eval_protocol.models import EvaluationRow, InputMetadata, Message, ExecutionMetadata
|
|
15
|
+
from .base import BaseAdapter
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool = True) -> List[Message]:
|
|
19
|
+
messages: List[Message] = []
|
|
20
|
+
|
|
21
|
+
# Prefer explicit output messages if provided
|
|
22
|
+
output = trace.get("output") or {}
|
|
23
|
+
out_msgs = output.get("messages")
|
|
24
|
+
if isinstance(out_msgs, list):
|
|
25
|
+
for m in out_msgs:
|
|
26
|
+
messages.append(
|
|
27
|
+
Message(
|
|
28
|
+
role=m.get("role"),
|
|
29
|
+
content=m.get("content"),
|
|
30
|
+
tool_calls=m.get("tool_calls") if include_tool_calls else None,
|
|
31
|
+
tool_call_id=m.get("tool_call_id"),
|
|
32
|
+
name=m.get("name"),
|
|
33
|
+
)
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# If no explicit output messages, fall back to final bubble from choices
|
|
37
|
+
if not messages:
|
|
38
|
+
choices = output.get("choices")
|
|
39
|
+
if isinstance(choices, list) and choices:
|
|
40
|
+
msg = (choices[0] or {}).get("message", {})
|
|
41
|
+
if msg:
|
|
42
|
+
messages.append(Message(role=msg.get("role"), content=msg.get("content")))
|
|
43
|
+
|
|
44
|
+
# Prepend input messages if present and not already contained
|
|
45
|
+
inputs = trace.get("inputs") or {}
|
|
46
|
+
in_msgs = inputs.get("messages")
|
|
47
|
+
if isinstance(in_msgs, list):
|
|
48
|
+
prefixed = [Message(role=m.get("role"), content=m.get("content")) for m in in_msgs]
|
|
49
|
+
messages = prefixed + messages
|
|
50
|
+
|
|
51
|
+
return messages
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _convert_trace_to_evaluation_row(
|
|
55
|
+
trace: Dict[str, Any], include_tool_calls: bool = True
|
|
56
|
+
) -> Optional[EvaluationRow]:
|
|
57
|
+
messages = _extract_messages_from_trace(trace, include_tool_calls=include_tool_calls)
|
|
58
|
+
if not messages:
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
# Provider-native IDs for UI joinability
|
|
62
|
+
session_data = {
|
|
63
|
+
"weave_trace_id": trace.get("id"),
|
|
64
|
+
"weave_project_id": trace.get("project_id"),
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
# Optional EP identifiers (if present in provider payload)
|
|
68
|
+
meta_in = (trace.get("inputs") or {}).get("metadata") or {}
|
|
69
|
+
meta_out = (trace.get("output") or {}).get("metadata") or {}
|
|
70
|
+
metadata = {**meta_in, **meta_out}
|
|
71
|
+
|
|
72
|
+
input_metadata = InputMetadata(row_id=metadata.get("row_id"), session_data=session_data)
|
|
73
|
+
|
|
74
|
+
# Preserve default factory behavior by only setting provided fields
|
|
75
|
+
exec_kwargs: Dict[str, Any] = {}
|
|
76
|
+
for k in ("invocation_id", "experiment_id", "rollout_id", "run_id"):
|
|
77
|
+
if metadata.get(k) is not None:
|
|
78
|
+
exec_kwargs[k] = metadata[k]
|
|
79
|
+
execution_metadata = ExecutionMetadata(**exec_kwargs)
|
|
80
|
+
|
|
81
|
+
# Capture tools if provider exposes them (prefer inputs)
|
|
82
|
+
tools = None
|
|
83
|
+
inputs = trace.get("inputs") or {}
|
|
84
|
+
if include_tool_calls and isinstance(inputs, dict) and "tools" in inputs:
|
|
85
|
+
tools = inputs.get("tools")
|
|
86
|
+
|
|
87
|
+
return EvaluationRow(
|
|
88
|
+
messages=messages, tools=tools, input_metadata=input_metadata, execution_metadata=execution_metadata
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class WeaveAdapter(BaseAdapter):
|
|
93
|
+
"""Adapter to pull data from Weave Trace API and convert to EvaluationRow format."""
|
|
94
|
+
|
|
95
|
+
def __init__(
|
|
96
|
+
self, base_url: Optional[str] = None, api_token: Optional[str] = None, project_id: Optional[str] = None
|
|
97
|
+
):
|
|
98
|
+
self.base_url = base_url or os.getenv("WEAVE_TRACE_BASE_URL", "https://trace.wandb.ai")
|
|
99
|
+
self.api_token = api_token or os.getenv("WANDB_API_KEY")
|
|
100
|
+
# project_id is in form "<entity>/<project>"
|
|
101
|
+
self.project_id = project_id or (f"{os.getenv('WANDB_ENTITY')}/{os.getenv('WANDB_PROJECT')}")
|
|
102
|
+
if not self.api_token or not self.project_id or "/" not in self.project_id:
|
|
103
|
+
raise ValueError("Missing Weave credentials or project (WANDB_API_KEY and WANDB_ENTITY/WANDB_PROJECT)")
|
|
104
|
+
|
|
105
|
+
def _fetch_traces(self, limit: int = 100) -> List[Dict[str, Any]]:
|
|
106
|
+
url = f"{self.base_url}/calls/stream_query"
|
|
107
|
+
payload = {
|
|
108
|
+
"project_id": self.project_id,
|
|
109
|
+
"filter": {"trace_roots_only": True},
|
|
110
|
+
"limit": limit,
|
|
111
|
+
"offset": 0,
|
|
112
|
+
"sort_by": [{"field": "started_at", "direction": "desc"}],
|
|
113
|
+
"include_feedback": False,
|
|
114
|
+
}
|
|
115
|
+
headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"}
|
|
116
|
+
resp = requests.post(url, json=payload, headers=headers, timeout=30)
|
|
117
|
+
resp.raise_for_status()
|
|
118
|
+
body = resp.json() or {}
|
|
119
|
+
return body.get("data", [])
|
|
120
|
+
|
|
121
|
+
def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
|
|
122
|
+
limit = kwargs.get("limit", 100)
|
|
123
|
+
include_tool_calls = kwargs.get("include_tool_calls", True)
|
|
124
|
+
traces = self._fetch_traces(limit=limit)
|
|
125
|
+
rows: List[EvaluationRow] = []
|
|
126
|
+
for tr in traces:
|
|
127
|
+
row = _convert_trace_to_evaluation_row(tr, include_tool_calls=include_tool_calls)
|
|
128
|
+
if row:
|
|
129
|
+
rows.append(row)
|
|
130
|
+
return rows
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
3
|
+
from .elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def setup_rollout_logging_for_elasticsearch_handler(
|
|
7
|
+
handler: ElasticsearchDirectHttpHandler, rollout_id: str, elastic_search_config: ElasticsearchConfig
|
|
8
|
+
) -> None:
|
|
9
|
+
"""
|
|
10
|
+
Whenever a new subprocess is created, we need to setup the rollout context
|
|
11
|
+
for the subprocess. This is useful when implementing your own remote server
|
|
12
|
+
for rollout processing.
|
|
13
|
+
|
|
14
|
+
1. Set the EP_ROLLOUT_ID environment variable
|
|
15
|
+
2. Configure the Elasticsearch handler with the Elasticsearch config
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
# this should only affect this subprocess so logs from this subprocess can
|
|
19
|
+
# be correlated to the rollout
|
|
20
|
+
os.environ["EP_ROLLOUT_ID"] = rollout_id
|
|
21
|
+
|
|
22
|
+
handler.configure(elasticsearch_config=elastic_search_config)
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Metadata Extraction Gateway
|
|
3
|
+
A FastAPI service that sits in front of LiteLLM and extracts metadata from URL paths.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from fastapi import FastAPI, Depends, HTTPException, Request, Query
|
|
7
|
+
from typing import Optional, List
|
|
8
|
+
import os
|
|
9
|
+
import redis
|
|
10
|
+
import logging
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import sys
|
|
14
|
+
from contextlib import asynccontextmanager
|
|
15
|
+
|
|
16
|
+
from .models import ProxyConfig, LangfuseTracesResponse
|
|
17
|
+
from .auth import AuthProvider, NoAuthProvider
|
|
18
|
+
from .litellm import handle_chat_completion, proxy_to_litellm
|
|
19
|
+
from .langfuse import fetch_langfuse_traces
|
|
20
|
+
|
|
21
|
+
# Configure logging before any other imports (so all modules inherit this config)
|
|
22
|
+
log_level = os.getenv("LOG_LEVEL", "INFO").upper()
|
|
23
|
+
logging.basicConfig(
|
|
24
|
+
level=getattr(logging, log_level),
|
|
25
|
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
|
26
|
+
handlers=[logging.StreamHandler(sys.stdout)],
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def build_proxy_config() -> ProxyConfig:
|
|
33
|
+
"""Load environment and secrets, and build ProxyConfig (no Redis)."""
|
|
34
|
+
# Env
|
|
35
|
+
litellm_url = os.getenv("LITELLM_URL")
|
|
36
|
+
if not litellm_url:
|
|
37
|
+
raise ValueError("LITELLM_URL environment variable must be set")
|
|
38
|
+
request_timeout = float(os.getenv("REQUEST_TIMEOUT", "300.0"))
|
|
39
|
+
|
|
40
|
+
# Secrets - use SECRETS_PATH env var if set, otherwise default to proxy/secrets.json
|
|
41
|
+
secrets_path_str = os.getenv("SECRETS_PATH")
|
|
42
|
+
if secrets_path_str:
|
|
43
|
+
secrets_path = Path(secrets_path_str)
|
|
44
|
+
else:
|
|
45
|
+
secrets_path = Path(__file__).parent / "secrets.json"
|
|
46
|
+
if not secrets_path.exists():
|
|
47
|
+
raise ValueError(
|
|
48
|
+
"secrets.json not found! Please create it from secrets.json.example:\n"
|
|
49
|
+
" cp litellm_proxy_config/proxy/secrets.json.example litellm_proxy_config/proxy/secrets.json\n"
|
|
50
|
+
"Then add your Langfuse API keys to secrets.json"
|
|
51
|
+
)
|
|
52
|
+
try:
|
|
53
|
+
with open(secrets_path, "r") as f:
|
|
54
|
+
secrets_config = json.load(f)
|
|
55
|
+
langfuse_keys = secrets_config["langfuse_keys"]
|
|
56
|
+
default_project_id = secrets_config["default_project_id"]
|
|
57
|
+
logger.info(f"Loaded {len(langfuse_keys)} Langfuse project(s) from secrets.json")
|
|
58
|
+
except KeyError as e:
|
|
59
|
+
raise ValueError(f"Missing required key in secrets.json: {e}")
|
|
60
|
+
except json.JSONDecodeError as e:
|
|
61
|
+
raise ValueError(f"Invalid JSON in secrets.json: {e}")
|
|
62
|
+
|
|
63
|
+
return ProxyConfig(
|
|
64
|
+
litellm_url=litellm_url,
|
|
65
|
+
request_timeout=request_timeout,
|
|
66
|
+
langfuse_keys=langfuse_keys,
|
|
67
|
+
default_project_id=default_project_id,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def init_redis() -> redis.Redis:
|
|
72
|
+
"""Initialize and return a Redis client from environment variables."""
|
|
73
|
+
redis_host = os.getenv("REDIS_HOST")
|
|
74
|
+
if not redis_host:
|
|
75
|
+
raise ValueError("REDIS_HOST environment variable must be set")
|
|
76
|
+
redis_port = int(os.getenv("REDIS_PORT", "6379"))
|
|
77
|
+
redis_password = os.getenv("REDIS_PASSWORD")
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
client = redis.Redis(
|
|
81
|
+
host=redis_host,
|
|
82
|
+
port=redis_port,
|
|
83
|
+
password=redis_password if redis_password else None,
|
|
84
|
+
decode_responses=True,
|
|
85
|
+
socket_connect_timeout=5,
|
|
86
|
+
socket_timeout=5,
|
|
87
|
+
retry_on_timeout=True,
|
|
88
|
+
)
|
|
89
|
+
client.ping()
|
|
90
|
+
logger.info(f"Connected to Redis at {redis_host}:{redis_port}")
|
|
91
|
+
return client
|
|
92
|
+
except Exception as e:
|
|
93
|
+
raise ConnectionError(f"Failed to connect to Redis at {redis_host}:{redis_port}: {e}")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def create_app(
|
|
97
|
+
auth_provider: AuthProvider = NoAuthProvider(),
|
|
98
|
+
) -> FastAPI:
|
|
99
|
+
@asynccontextmanager
|
|
100
|
+
async def lifespan(app: FastAPI):
|
|
101
|
+
# Build runtime on startup
|
|
102
|
+
app.state.config = build_proxy_config()
|
|
103
|
+
app.state.redis = init_redis()
|
|
104
|
+
try:
|
|
105
|
+
yield
|
|
106
|
+
finally:
|
|
107
|
+
try:
|
|
108
|
+
app.state.redis.close()
|
|
109
|
+
except Exception:
|
|
110
|
+
pass
|
|
111
|
+
|
|
112
|
+
app = FastAPI(title="LiteLLM Metadata Proxy", lifespan=lifespan)
|
|
113
|
+
|
|
114
|
+
def get_config(request: Request) -> ProxyConfig:
|
|
115
|
+
return request.app.state.config
|
|
116
|
+
|
|
117
|
+
def get_redis(request: Request) -> redis.Redis:
|
|
118
|
+
return request.app.state.redis
|
|
119
|
+
|
|
120
|
+
async def require_auth(request: Request) -> None:
|
|
121
|
+
auth_header = request.headers.get("authorization", "")
|
|
122
|
+
api_key = None
|
|
123
|
+
if auth_header.startswith("Bearer "):
|
|
124
|
+
api_key = auth_header.replace("Bearer ", "").strip()
|
|
125
|
+
|
|
126
|
+
auth_provider.validate(api_key)
|
|
127
|
+
return None
|
|
128
|
+
|
|
129
|
+
# =====================
|
|
130
|
+
# Chat completion routes
|
|
131
|
+
# =====================
|
|
132
|
+
@app.post(
|
|
133
|
+
"/project_id/{project_id}/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/chat/completions"
|
|
134
|
+
)
|
|
135
|
+
@app.post(
|
|
136
|
+
"/v1/project_id/{project_id}/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/chat/completions"
|
|
137
|
+
)
|
|
138
|
+
@app.post(
|
|
139
|
+
"/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/chat/completions"
|
|
140
|
+
)
|
|
141
|
+
@app.post(
|
|
142
|
+
"/v1/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/chat/completions"
|
|
143
|
+
)
|
|
144
|
+
@app.post(
|
|
145
|
+
"/project_id/{project_id}/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/encoded_base_url/{encoded_base_url}/chat/completions"
|
|
146
|
+
)
|
|
147
|
+
@app.post(
|
|
148
|
+
"/v1/project_id/{project_id}/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/encoded_base_url/{encoded_base_url}/chat/completions"
|
|
149
|
+
)
|
|
150
|
+
@app.post(
|
|
151
|
+
"/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/encoded_base_url/{encoded_base_url}/chat/completions"
|
|
152
|
+
)
|
|
153
|
+
@app.post(
|
|
154
|
+
"/v1/rollout_id/{rollout_id}/invocation_id/{invocation_id}/experiment_id/{experiment_id}/run_id/{run_id}/row_id/{row_id}/encoded_base_url/{encoded_base_url}/chat/completions"
|
|
155
|
+
)
|
|
156
|
+
async def chat_completion_with_full_metadata(
|
|
157
|
+
rollout_id: str,
|
|
158
|
+
invocation_id: str,
|
|
159
|
+
experiment_id: str,
|
|
160
|
+
run_id: str,
|
|
161
|
+
row_id: str,
|
|
162
|
+
request: Request,
|
|
163
|
+
project_id: Optional[str] = None,
|
|
164
|
+
encoded_base_url: Optional[str] = None,
|
|
165
|
+
config: ProxyConfig = Depends(get_config),
|
|
166
|
+
redis_client: redis.Redis = Depends(get_redis),
|
|
167
|
+
):
|
|
168
|
+
return await handle_chat_completion(
|
|
169
|
+
config=config,
|
|
170
|
+
redis_client=redis_client,
|
|
171
|
+
request=request,
|
|
172
|
+
project_id=project_id,
|
|
173
|
+
rollout_id=rollout_id,
|
|
174
|
+
invocation_id=invocation_id,
|
|
175
|
+
experiment_id=experiment_id,
|
|
176
|
+
run_id=run_id,
|
|
177
|
+
row_id=row_id,
|
|
178
|
+
encoded_base_url=encoded_base_url,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
@app.post("/project_id/{project_id}/chat/completions")
@app.post("/v1/project_id/{project_id}/chat/completions")
async def chat_completion_with_project_only(
    project_id: str,
    request: Request,
    config: ProxyConfig = Depends(get_config),
    redis_client: redis.Redis = Depends(get_redis),
):
    # Chat-completions endpoint for URLs that carry only a project id.
    # No rollout metadata is passed, so handle_chat_completion falls back to
    # whatever defaults it declares for those arguments.
    completion = await handle_chat_completion(
        config=config,
        redis_client=redis_client,
        request=request,
        project_id=project_id,
    )
    return completion
|
|
195
|
+
|
|
196
|
+
# ===============
|
|
197
|
+
# Traces routes
|
|
198
|
+
# ===============
|
|
199
|
+
@app.get("/traces", response_model=LangfuseTracesResponse)
@app.get("/v1/traces", response_model=LangfuseTracesResponse)
@app.get("/project_id/{project_id}/traces", response_model=LangfuseTracesResponse)
@app.get("/v1/project_id/{project_id}/traces", response_model=LangfuseTracesResponse)
async def get_langfuse_traces(
    tags: List[str] = Query(...),  # REQUIRED query param
    project_id: Optional[str] = None,
    limit: int = 100,
    sample_size: Optional[int] = None,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    name: Optional[str] = None,
    environment: Optional[str] = None,
    version: Optional[str] = None,
    release: Optional[str] = None,
    fields: Optional[str] = None,
    hours_back: Optional[int] = None,
    from_timestamp: Optional[str] = None,
    to_timestamp: Optional[str] = None,
    sleep_between_gets: float = 2.5,
    max_retries: int = 3,
    config: ProxyConfig = Depends(get_config),
    redis_client: redis.Redis = Depends(get_redis),
    _: None = Depends(require_auth),
) -> LangfuseTracesResponse:
    # Trace-listing endpoint. `tags` is the only mandatory input; project_id is
    # a path parameter on the /project_id/... variants and None otherwise.
    # The require_auth dependency is declared only so that it runs; its value
    # is bound to `_` and never read.
    #
    # Every filter / paging / retry knob is forwarded verbatim, in the order
    # declared above, to fetch_langfuse_traces.
    trace_query = {
        "tags": tags,
        "project_id": project_id,
        "limit": limit,
        "sample_size": sample_size,
        "user_id": user_id,
        "session_id": session_id,
        "name": name,
        "environment": environment,
        "version": version,
        "release": release,
        "fields": fields,
        "hours_back": hours_back,
        "from_timestamp": from_timestamp,
        "to_timestamp": to_timestamp,
        "sleep_between_gets": sleep_between_gets,
        "max_retries": max_retries,
    }
    traces = await fetch_langfuse_traces(
        config=config,
        redis_client=redis_client,
        **trace_query,
    )
    return traces
|
|
244
|
+
|
|
245
|
+
# Health
|
|
246
|
+
@app.get("/health")
async def health():
    # Health-check endpoint: replies with a constant payload and takes no
    # config or Redis dependency.
    payload = {"status": "healthy", "service": "metadata-proxy"}
    return payload
|
|
249
|
+
|
|
250
|
+
# Catch-all
|
|
251
|
+
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
async def catch_all_proxy(
    path: str,
    request: Request,
    config: ProxyConfig = Depends(get_config),
):
    # Fallback route: `{path:path}` captures the whole remaining URL path
    # (slashes included) for the five listed HTTP methods, and the request is
    # handed to proxy_to_litellm as-is. It is declared after the specific
    # routes above so that they are matched first.
    upstream_response = await proxy_to_litellm(config, path, request)
    return upstream_response
|
|
258
|
+
|
|
259
|
+
return app
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class AuthProvider(ABC):
    """Abstract interface for pluggable API-key validation.

    Concrete providers must override :meth:`validate`; the class itself
    cannot be instantiated.
    """

    @abstractmethod
    def validate(self, api_key: Optional[str]) -> Optional[str]:
        """Check *api_key* (which may be ``None``) and return a string or ``None``.

        NOTE(review): the meaning of the returned string is not defined in
        this module -- presumably an identifier derived from the key; confirm
        against the callers.
        """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NoAuthProvider(AuthProvider):
    """Auth provider that performs no check at all."""

    def validate(self, api_key: Optional[str]) -> Optional[str]:
        """Ignore *api_key* and always return ``None``."""
        result: Optional[str] = None
        return result
|