eval-protocol 0.3.23__tar.gz → 0.3.25__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.23/eval_protocol.egg-info → eval_protocol-0.3.25}/PKG-INFO +3 -6
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/fireworks_tracing.py +3 -55
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/app.py +15 -11
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/langfuse.py +0 -1
- eval_protocol-0.3.25/eval_protocol/proxy/proxy_core/litellm.py +173 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/models.py +1 -1
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test.py +15 -9
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test_utils.py +6 -2
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/exception_config.py +4 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/remote_rollout_processor.py +3 -2
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/reward_function.py +1 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25/eval_protocol.egg-info}/PKG-INFO +3 -6
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/requires.txt +2 -5
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/pyproject.toml +2 -6
- eval_protocol-0.3.23/eval_protocol/proxy/proxy_core/litellm.py +0 -154
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/LICENSE +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/README.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/fireworks_v1_completions_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/setup.cfg +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/setup.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_startup_benchmark.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fireworks_v1_completions_client.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_format.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_length.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_math.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_models.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_no_implicit_dotenv.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/versioneer.py +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DFeF7AG_.js +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DFeF7AG_.js.map +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DvKW7FQL.css +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.23
|
|
3
|
+
Version: 0.3.25
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -24,7 +24,7 @@ Requires-Dist: hydra-core>=1.3.2
|
|
|
24
24
|
Requires-Dist: omegaconf>=2.3.0
|
|
25
25
|
Requires-Dist: httpx>=0.24.0
|
|
26
26
|
Requires-Dist: anthropic>=0.59.0
|
|
27
|
-
Requires-Dist: litellm<1.
|
|
27
|
+
Requires-Dist: litellm<1.75.0
|
|
28
28
|
Requires-Dist: pytest>=6.0.0
|
|
29
29
|
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
30
|
Requires-Dist: peewee>=3.18.2
|
|
@@ -111,14 +111,11 @@ Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
|
|
|
111
111
|
Provides-Extra: langgraph-tools
|
|
112
112
|
Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
|
|
113
113
|
Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
|
|
114
|
+
Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
|
|
114
115
|
Provides-Extra: proxy
|
|
115
116
|
Requires-Dist: redis>=5.0.0; extra == "proxy"
|
|
116
117
|
Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
117
118
|
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
118
|
-
Requires-Dist: litellm<1.82.0,>=1.81.0; extra == "proxy"
|
|
119
|
-
Requires-Dist: opentelemetry-api>=1.29.0; extra == "proxy"
|
|
120
|
-
Requires-Dist: opentelemetry-sdk>=1.29.0; extra == "proxy"
|
|
121
|
-
Requires-Dist: opentelemetry-exporter-otlp>=1.29.0; extra == "proxy"
|
|
122
119
|
Dynamic: license-file
|
|
123
120
|
|
|
124
121
|
# Eval Protocol
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-03-
|
|
11
|
+
"date": "2026-03-13T16:09:44-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.23"
|
|
14
|
+
"full-revisionid": "3c8d8f23f7b301697f246c64e57d08fa1c7af50b",
|
|
15
|
+
"version": "0.3.25"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -8,10 +8,8 @@ from __future__ import annotations
|
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
10
|
from datetime import datetime
|
|
11
|
-
import ast
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
11
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
+
import os
|
|
15
13
|
|
|
16
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
17
15
|
from .base import BaseAdapter
|
|
@@ -46,43 +44,6 @@ class TraceDictConverter(Protocol):
|
|
|
46
44
|
...
|
|
47
45
|
|
|
48
46
|
|
|
49
|
-
def extract_otel_attributes(observations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
|
|
50
|
-
"""Attempt to extract and parse attributes from raw_gen_ai_request observation. This only works when stored in OTEL format.
|
|
51
|
-
|
|
52
|
-
Args:
|
|
53
|
-
observations: List of observation dictionaries from the trace
|
|
54
|
-
|
|
55
|
-
Returns:
|
|
56
|
-
Dict with all attributes parsed. Or None if not found.
|
|
57
|
-
"""
|
|
58
|
-
for obs in observations:
|
|
59
|
-
if obs.get("name") == "raw_gen_ai_request" and obs.get("type") == "SPAN":
|
|
60
|
-
metadata = obs.get("metadata") or {}
|
|
61
|
-
attributes = metadata.get("attributes") or {}
|
|
62
|
-
|
|
63
|
-
result: Dict[str, Any] = {}
|
|
64
|
-
|
|
65
|
-
for key, value in attributes.items():
|
|
66
|
-
# Try to parse stringified objects (could be Python repr or JSON)
|
|
67
|
-
if isinstance(value, str) and value.startswith(("[", "{")):
|
|
68
|
-
try:
|
|
69
|
-
result[key] = ast.literal_eval(value)
|
|
70
|
-
except Exception as e:
|
|
71
|
-
logger.debug("Failed to parse %s with ast.literal_eval: %s", key, e)
|
|
72
|
-
try:
|
|
73
|
-
result[key] = json.loads(value)
|
|
74
|
-
except Exception as e:
|
|
75
|
-
logger.debug("Failed to parse %s with json.loads: %s", key, e)
|
|
76
|
-
result[key] = value
|
|
77
|
-
else:
|
|
78
|
-
result[key] = value
|
|
79
|
-
|
|
80
|
-
if result:
|
|
81
|
-
return result
|
|
82
|
-
|
|
83
|
-
return None
|
|
84
|
-
|
|
85
|
-
|
|
86
47
|
def convert_trace_dict_to_evaluation_row(
|
|
87
48
|
trace: Dict[str, Any], include_tool_calls: bool = True, span_name: Optional[str] = None
|
|
88
49
|
) -> Optional[EvaluationRow]:
|
|
@@ -135,19 +96,6 @@ def convert_trace_dict_to_evaluation_row(
|
|
|
135
96
|
):
|
|
136
97
|
break # Break early if we've found all the metadata we need
|
|
137
98
|
|
|
138
|
-
observations = trace.get("observations") or []
|
|
139
|
-
# We can only extract when stored in OTEL format.
|
|
140
|
-
otel_attributes = extract_otel_attributes(observations)
|
|
141
|
-
if otel_attributes:
|
|
142
|
-
# Find choices from any provider (llm.*.choices pattern)
|
|
143
|
-
choices = None
|
|
144
|
-
for key, value in otel_attributes.items():
|
|
145
|
-
if key.endswith(".choices") and isinstance(value, list):
|
|
146
|
-
choices = value
|
|
147
|
-
break
|
|
148
|
-
if choices and len(choices) > 0:
|
|
149
|
-
execution_metadata.finish_reason = choices[0].get("finish_reason")
|
|
150
|
-
|
|
151
99
|
return EvaluationRow(
|
|
152
100
|
messages=messages,
|
|
153
101
|
tools=tools,
|
|
@@ -212,7 +160,7 @@ def extract_messages_from_trace_dict(
|
|
|
212
160
|
# Fallback: use the last GENERATION observation which typically contains full chat history
|
|
213
161
|
if not messages:
|
|
214
162
|
try:
|
|
215
|
-
all_observations = trace.get("observations") or []
|
|
163
|
+
all_observations = trace.get("observations", [])
|
|
216
164
|
gens = [obs for obs in all_observations if obs.get("type") == "GENERATION"]
|
|
217
165
|
if gens:
|
|
218
166
|
gens.sort(key=lambda x: x.get("start_time", ""))
|
|
@@ -238,7 +186,7 @@ def get_final_generation_in_span_dict(trace: Dict[str, Any], span_name: str) ->
|
|
|
238
186
|
The final generation dictionary, or None if not found
|
|
239
187
|
"""
|
|
240
188
|
# Get all observations from the trace
|
|
241
|
-
all_observations = trace.get("observations") or []
|
|
189
|
+
all_observations = trace.get("observations", [])
|
|
242
190
|
|
|
243
191
|
# Find a span with the given name that has generation children
|
|
244
192
|
parent_span = None
|
|
@@ -15,7 +15,7 @@ from contextlib import asynccontextmanager
|
|
|
15
15
|
|
|
16
16
|
from .models import ProxyConfig, LangfuseTracesResponse, TracesParams, ChatParams, ChatRequestHook, TracesRequestHook
|
|
17
17
|
from .auth import AuthProvider, NoAuthProvider
|
|
18
|
-
from .litellm import handle_chat_completion
|
|
18
|
+
from .litellm import handle_chat_completion, proxy_to_litellm
|
|
19
19
|
from .langfuse import fetch_langfuse_traces, pointwise_fetch_langfuse_trace
|
|
20
20
|
|
|
21
21
|
# Configure logging before any other imports (so all modules inherit this config)
|
|
@@ -35,6 +35,10 @@ def build_proxy_config(
|
|
|
35
35
|
preprocess_traces_request: Optional[TracesRequestHook] = None,
|
|
36
36
|
) -> ProxyConfig:
|
|
37
37
|
"""Load environment and secrets, and build ProxyConfig"""
|
|
38
|
+
# Env
|
|
39
|
+
litellm_url = os.getenv("LITELLM_URL")
|
|
40
|
+
if not litellm_url:
|
|
41
|
+
raise ValueError("LITELLM_URL environment variable must be set")
|
|
38
42
|
request_timeout = float(os.getenv("REQUEST_TIMEOUT", "300.0"))
|
|
39
43
|
langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
|
40
44
|
|
|
@@ -62,6 +66,7 @@ def build_proxy_config(
|
|
|
62
66
|
raise ValueError(f"Invalid format in secrets file {secrets_path.name}: {e}")
|
|
63
67
|
|
|
64
68
|
return ProxyConfig(
|
|
69
|
+
litellm_url=litellm_url,
|
|
65
70
|
request_timeout=request_timeout,
|
|
66
71
|
langfuse_host=langfuse_host,
|
|
67
72
|
langfuse_keys=langfuse_keys,
|
|
@@ -108,16 +113,6 @@ def create_app(
|
|
|
108
113
|
app.state.config = build_proxy_config(preprocess_chat_request, preprocess_traces_request)
|
|
109
114
|
app.state.redis = init_redis()
|
|
110
115
|
|
|
111
|
-
config = app.state.config
|
|
112
|
-
default_keys = config.langfuse_keys[config.default_project_id]
|
|
113
|
-
os.environ["LANGFUSE_PUBLIC_KEY"] = default_keys["public_key"]
|
|
114
|
-
os.environ["LANGFUSE_SECRET_KEY"] = default_keys["secret_key"]
|
|
115
|
-
os.environ.setdefault("LANGFUSE_HOST", config.langfuse_host)
|
|
116
|
-
|
|
117
|
-
import litellm
|
|
118
|
-
|
|
119
|
-
litellm.callbacks = ["langfuse_otel"]
|
|
120
|
-
|
|
121
116
|
try:
|
|
122
117
|
yield
|
|
123
118
|
finally:
|
|
@@ -302,4 +297,13 @@ def create_app(
|
|
|
302
297
|
async def health():
|
|
303
298
|
return {"status": "healthy", "service": "metadata-proxy"}
|
|
304
299
|
|
|
300
|
+
# Catch-all
|
|
301
|
+
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
|
|
302
|
+
async def catch_all_proxy(
|
|
303
|
+
path: str,
|
|
304
|
+
request: Request,
|
|
305
|
+
config: ProxyConfig = Depends(get_config),
|
|
306
|
+
):
|
|
307
|
+
return await proxy_to_litellm(config, path, request)
|
|
308
|
+
|
|
305
309
|
return app
|
|
@@ -50,7 +50,6 @@ def _serialize_trace_to_dict(trace_full: Any) -> Dict[str, Any]:
|
|
|
50
50
|
"input": getattr(obs, "input", None),
|
|
51
51
|
"output": getattr(obs, "output", None),
|
|
52
52
|
"parent_observation_id": getattr(obs, "parent_observation_id", None),
|
|
53
|
-
"metadata": getattr(obs, "metadata", None),
|
|
54
53
|
}
|
|
55
54
|
for obs in getattr(trace_full, "observations", [])
|
|
56
55
|
]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LiteLLM client - handles all communication with LiteLLM service.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import base64
|
|
7
|
+
import httpx
|
|
8
|
+
import logging
|
|
9
|
+
from uuid6 import uuid7
|
|
10
|
+
from fastapi import Request, Response, HTTPException
|
|
11
|
+
import redis
|
|
12
|
+
from .redis_utils import register_insertion_id
|
|
13
|
+
from .models import ProxyConfig, ChatParams
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
async def handle_chat_completion(
|
|
19
|
+
config: ProxyConfig,
|
|
20
|
+
redis_client: redis.Redis,
|
|
21
|
+
request: Request,
|
|
22
|
+
params: ChatParams,
|
|
23
|
+
) -> Response:
|
|
24
|
+
"""
|
|
25
|
+
Handle chat completion requests and forward to LiteLLM.
|
|
26
|
+
|
|
27
|
+
If metadata IDs (rollout_id, etc.) are provided, they'll be added as tags
|
|
28
|
+
and the assistant message count will be tracked in Redis.
|
|
29
|
+
|
|
30
|
+
If encoded_base_url is provided, it will be decoded and added to the request.
|
|
31
|
+
"""
|
|
32
|
+
body = await request.body()
|
|
33
|
+
data = json.loads(body) if body else {}
|
|
34
|
+
|
|
35
|
+
if config.preprocess_chat_request:
|
|
36
|
+
data, params = config.preprocess_chat_request(data, request, params)
|
|
37
|
+
|
|
38
|
+
project_id = params.project_id
|
|
39
|
+
rollout_id = params.rollout_id
|
|
40
|
+
invocation_id = params.invocation_id
|
|
41
|
+
experiment_id = params.experiment_id
|
|
42
|
+
run_id = params.run_id
|
|
43
|
+
row_id = params.row_id
|
|
44
|
+
encoded_base_url = params.encoded_base_url
|
|
45
|
+
|
|
46
|
+
# Use default project if not specified
|
|
47
|
+
if project_id is None:
|
|
48
|
+
project_id = config.default_project_id
|
|
49
|
+
|
|
50
|
+
# Decode and add base_url if provided
|
|
51
|
+
if encoded_base_url:
|
|
52
|
+
try:
|
|
53
|
+
# Decode from URL-safe base64
|
|
54
|
+
decoded_bytes = base64.urlsafe_b64decode(encoded_base_url)
|
|
55
|
+
base_url = decoded_bytes.decode("utf-8")
|
|
56
|
+
data["base_url"] = base_url
|
|
57
|
+
logger.debug(f"Decoded base_url: {base_url}")
|
|
58
|
+
except Exception as e:
|
|
59
|
+
logger.error(f"Failed to decode base_url: {e}")
|
|
60
|
+
raise HTTPException(status_code=400, detail=f"Invalid encoded_base_url: {str(e)}")
|
|
61
|
+
|
|
62
|
+
# Extract API key from Authorization header and inject into request body
|
|
63
|
+
auth_header = request.headers.get("authorization", "")
|
|
64
|
+
if auth_header.startswith("Bearer "):
|
|
65
|
+
api_key = auth_header.replace("Bearer ", "").strip()
|
|
66
|
+
# Only inject API key if model is a Fireworks model
|
|
67
|
+
model = data.get("model")
|
|
68
|
+
if model and isinstance(model, str) and model.startswith("fireworks_ai"):
|
|
69
|
+
data["api_key"] = api_key
|
|
70
|
+
|
|
71
|
+
# If metadata IDs are provided, add them as tags
|
|
72
|
+
insertion_id = None
|
|
73
|
+
if rollout_id is not None:
|
|
74
|
+
insertion_id = str(uuid7())
|
|
75
|
+
|
|
76
|
+
if "metadata" not in data:
|
|
77
|
+
data["metadata"] = {}
|
|
78
|
+
if "tags" not in data["metadata"]:
|
|
79
|
+
data["metadata"]["tags"] = []
|
|
80
|
+
|
|
81
|
+
# Add extracted IDs as tags
|
|
82
|
+
data["metadata"]["tags"].extend(
|
|
83
|
+
[
|
|
84
|
+
f"rollout_id:{rollout_id}",
|
|
85
|
+
f"insertion_id:{insertion_id}",
|
|
86
|
+
f"invocation_id:{invocation_id}",
|
|
87
|
+
f"experiment_id:{experiment_id}",
|
|
88
|
+
f"run_id:{run_id}",
|
|
89
|
+
f"row_id:{row_id}",
|
|
90
|
+
]
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Add Langfuse configuration
|
|
94
|
+
data["langfuse_public_key"] = config.langfuse_keys[project_id]["public_key"]
|
|
95
|
+
data["langfuse_secret_key"] = config.langfuse_keys[project_id]["secret_key"]
|
|
96
|
+
data["langfuse_host"] = config.langfuse_host
|
|
97
|
+
|
|
98
|
+
# Forward to LiteLLM's standard /chat/completions endpoint
|
|
99
|
+
# Set longer timeout for LLM API calls (LLMs can be slow)
|
|
100
|
+
timeout = httpx.Timeout(config.request_timeout)
|
|
101
|
+
async with httpx.AsyncClient(timeout=timeout) as client:
|
|
102
|
+
# Copy headers from original request but exclude content-length (httpx will set it correctly)
|
|
103
|
+
headers = dict(request.headers)
|
|
104
|
+
headers.pop("host", None)
|
|
105
|
+
headers.pop("content-length", None) # Let httpx calculate the correct length
|
|
106
|
+
headers["content-type"] = "application/json"
|
|
107
|
+
|
|
108
|
+
# Forward to LiteLLM
|
|
109
|
+
litellm_url = f"{config.litellm_url}/chat/completions"
|
|
110
|
+
|
|
111
|
+
response = await client.post(
|
|
112
|
+
litellm_url,
|
|
113
|
+
json=data, # httpx will serialize and set correct Content-Length
|
|
114
|
+
headers=headers,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Register insertion_id in Redis only on successful response
|
|
118
|
+
if response.status_code == 200 and insertion_id is not None and rollout_id is not None:
|
|
119
|
+
register_insertion_id(redis_client, rollout_id, insertion_id)
|
|
120
|
+
|
|
121
|
+
# Return the response
|
|
122
|
+
return Response(
|
|
123
|
+
content=response.content,
|
|
124
|
+
status_code=response.status_code,
|
|
125
|
+
headers=dict(response.headers),
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
async def proxy_to_litellm(config: ProxyConfig, path: str, request: Request) -> Response:
|
|
130
|
+
"""
|
|
131
|
+
Catch-all proxy: Forward any request to LiteLLM, extracting API key from Authorization header.
|
|
132
|
+
"""
|
|
133
|
+
# Set longer timeout for LLM API calls (LLMs can be slow)
|
|
134
|
+
timeout = httpx.Timeout(config.request_timeout)
|
|
135
|
+
async with httpx.AsyncClient(timeout=timeout) as client:
|
|
136
|
+
# Copy headers
|
|
137
|
+
headers = dict(request.headers)
|
|
138
|
+
headers.pop("host", None)
|
|
139
|
+
headers.pop("content-length", None)
|
|
140
|
+
|
|
141
|
+
# Get body
|
|
142
|
+
body = await request.body()
|
|
143
|
+
|
|
144
|
+
# Pass through API key from Authorization header
|
|
145
|
+
if request.method in ["POST", "PUT", "PATCH"] and body:
|
|
146
|
+
try:
|
|
147
|
+
data = json.loads(body)
|
|
148
|
+
|
|
149
|
+
auth_header = request.headers.get("authorization", "")
|
|
150
|
+
if auth_header.startswith("Bearer "):
|
|
151
|
+
api_key = auth_header.replace("Bearer ", "").strip()
|
|
152
|
+
data["api_key"] = api_key
|
|
153
|
+
|
|
154
|
+
# Re-serialize
|
|
155
|
+
body = json.dumps(data).encode()
|
|
156
|
+
except json.JSONDecodeError:
|
|
157
|
+
pass
|
|
158
|
+
|
|
159
|
+
# Forward to LiteLLM
|
|
160
|
+
litellm_url = f"{config.litellm_url}/{path}"
|
|
161
|
+
|
|
162
|
+
response = await client.request(
|
|
163
|
+
method=request.method,
|
|
164
|
+
url=litellm_url,
|
|
165
|
+
headers=headers,
|
|
166
|
+
content=body,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
return Response(
|
|
170
|
+
content=response.content,
|
|
171
|
+
status_code=response.status_code,
|
|
172
|
+
headers=dict(response.headers),
|
|
173
|
+
)
|
|
@@ -53,6 +53,7 @@ class TracesParams(BaseModel):
|
|
|
53
53
|
class ProxyConfig(BaseModel):
|
|
54
54
|
"""Configuration model for the LiteLLM Metadata Proxy"""
|
|
55
55
|
|
|
56
|
+
litellm_url: str
|
|
56
57
|
request_timeout: float = 300.0
|
|
57
58
|
langfuse_host: str
|
|
58
59
|
langfuse_keys: Dict[str, Dict[str, str]]
|
|
@@ -72,7 +73,6 @@ class ObservationResponse(BaseModel):
|
|
|
72
73
|
input: Optional[Any] = None
|
|
73
74
|
output: Optional[Any] = None
|
|
74
75
|
parent_observation_id: Optional[str] = None
|
|
75
|
-
metadata: Optional[Dict[str, Any]] = None
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
class TraceResponse(BaseModel):
|
|
@@ -449,6 +449,8 @@ def evaluation_test(
|
|
|
449
449
|
finally:
|
|
450
450
|
if output_buffer:
|
|
451
451
|
await output_buffer.close()
|
|
452
|
+
await rollout_processor.acleanup()
|
|
453
|
+
rollout_processor.cleanup()
|
|
452
454
|
|
|
453
455
|
for res in priority_results:
|
|
454
456
|
run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
|
|
@@ -697,15 +699,19 @@ def evaluation_test(
|
|
|
697
699
|
# Lazy import (cached after first import above)
|
|
698
700
|
from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
|
|
699
701
|
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
702
|
+
try:
|
|
703
|
+
if isinstance(rollout_processor, MCPGymRolloutProcessor):
|
|
704
|
+
# For MCPGymRolloutProcessor, create and execute tasks one at a time to avoid port conflicts
|
|
705
|
+
for run_idx in range(num_runs):
|
|
706
|
+
task = asyncio.create_task(execute_run(run_idx, config))
|
|
707
|
+
await task
|
|
708
|
+
else:
|
|
709
|
+
# For other processors, create all tasks at once and run in parallel
|
|
710
|
+
# Concurrency is now controlled by the shared semaphore in each rollout processor
|
|
711
|
+
await run_tasks_with_run_progress(execute_run, num_runs, config)
|
|
712
|
+
finally:
|
|
713
|
+
await rollout_processor.acleanup()
|
|
714
|
+
rollout_processor.cleanup()
|
|
709
715
|
|
|
710
716
|
experiment_duration_seconds = time.perf_counter() - experiment_start_time
|
|
711
717
|
|
|
@@ -476,8 +476,12 @@ async def rollout_processor_with_retry(
|
|
|
476
476
|
yield result
|
|
477
477
|
|
|
478
478
|
finally:
|
|
479
|
-
|
|
480
|
-
|
|
479
|
+
# Cleanup is intentionally NOT called here. rollout_processor_with_retry
|
|
480
|
+
# is invoked per-run, but the processor (and its session) is shared
|
|
481
|
+
# across parallel runs. Closing per-run would kill in-flight requests
|
|
482
|
+
# in other runs. Cleanup is called once after all runs complete in
|
|
483
|
+
# evaluation_test.py.
|
|
484
|
+
pass
|
|
481
485
|
|
|
482
486
|
|
|
483
487
|
def sanitize_filename(text: str) -> str:
|
|
@@ -23,6 +23,7 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
|
|
|
23
23
|
return _default_retryable_exceptions
|
|
24
24
|
|
|
25
25
|
# Lazy imports (these are expensive)
|
|
26
|
+
import aiohttp
|
|
26
27
|
import httpx
|
|
27
28
|
import litellm
|
|
28
29
|
import requests
|
|
@@ -32,6 +33,9 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
|
|
|
32
33
|
ConnectionError, # type: ignore[assignment]
|
|
33
34
|
TimeoutError, # type: ignore[assignment]
|
|
34
35
|
OSError, # type: ignore[assignment] # Covers network-related OS errors
|
|
36
|
+
# aiohttp library exceptions
|
|
37
|
+
aiohttp.ClientConnectionError,
|
|
38
|
+
aiohttp.ServerDisconnectedError,
|
|
35
39
|
# Requests library exceptions
|
|
36
40
|
requests.exceptions.ConnectionError,
|
|
37
41
|
requests.exceptions.Timeout,
|
{eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -104,6 +104,9 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
104
104
|
try:
|
|
105
105
|
session = self._get_or_create_session()
|
|
106
106
|
async with session.post(init_url, json=init_payload.model_dump(), timeout=timeout_init) as resp:
|
|
107
|
+
if resp.status >= 500:
|
|
108
|
+
body = await resp.text()
|
|
109
|
+
raise ConnectionError(f"Remote /init returned server error (HTTP {resp.status}): {body}")
|
|
107
110
|
if resp.status >= 400:
|
|
108
111
|
body = await resp.text()
|
|
109
112
|
raise RuntimeError(f"Remote /init failed (HTTP {resp.status}): {body}")
|
|
@@ -215,8 +218,6 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
215
218
|
loop = asyncio.get_running_loop()
|
|
216
219
|
loop.create_task(self._session.close())
|
|
217
220
|
except RuntimeError:
|
|
218
|
-
# No running event loop - can't safely close the session.
|
|
219
|
-
# The session will be garbage collected eventually, but warn about it.
|
|
220
221
|
logger.warning(
|
|
221
222
|
"RemoteRolloutProcessor.cleanup() called outside of async context. "
|
|
222
223
|
"Session may not be properly closed. Use `await processor.acleanup()` when possible."
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.23
|
|
3
|
+
Version: 0.3.25
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -24,7 +24,7 @@ Requires-Dist: hydra-core>=1.3.2
|
|
|
24
24
|
Requires-Dist: omegaconf>=2.3.0
|
|
25
25
|
Requires-Dist: httpx>=0.24.0
|
|
26
26
|
Requires-Dist: anthropic>=0.59.0
|
|
27
|
-
Requires-Dist: litellm<1.
|
|
27
|
+
Requires-Dist: litellm<1.75.0
|
|
28
28
|
Requires-Dist: pytest>=6.0.0
|
|
29
29
|
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
30
|
Requires-Dist: peewee>=3.18.2
|
|
@@ -111,14 +111,11 @@ Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
|
|
|
111
111
|
Provides-Extra: langgraph-tools
|
|
112
112
|
Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
|
|
113
113
|
Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
|
|
114
|
+
Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
|
|
114
115
|
Provides-Extra: proxy
|
|
115
116
|
Requires-Dist: redis>=5.0.0; extra == "proxy"
|
|
116
117
|
Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
117
118
|
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
118
|
-
Requires-Dist: litellm<1.82.0,>=1.81.0; extra == "proxy"
|
|
119
|
-
Requires-Dist: opentelemetry-api>=1.29.0; extra == "proxy"
|
|
120
|
-
Requires-Dist: opentelemetry-sdk>=1.29.0; extra == "proxy"
|
|
121
|
-
Requires-Dist: opentelemetry-exporter-otlp>=1.29.0; extra == "proxy"
|
|
122
119
|
Dynamic: license-file
|
|
123
120
|
|
|
124
121
|
# Eval Protocol
|
|
@@ -12,7 +12,7 @@ hydra-core>=1.3.2
|
|
|
12
12
|
omegaconf>=2.3.0
|
|
13
13
|
httpx>=0.24.0
|
|
14
14
|
anthropic>=0.59.0
|
|
15
|
-
litellm<1.
|
|
15
|
+
litellm<1.75.0
|
|
16
16
|
pytest>=6.0.0
|
|
17
17
|
pytest-asyncio>=0.21.0
|
|
18
18
|
peewee>=3.18.2
|
|
@@ -93,6 +93,7 @@ langchain-core>=0.3.75
|
|
|
93
93
|
[langgraph_tools]
|
|
94
94
|
langgraph>=0.6.7
|
|
95
95
|
langchain>=0.3.0
|
|
96
|
+
langchain-fireworks>=0.3.0
|
|
96
97
|
|
|
97
98
|
[langsmith]
|
|
98
99
|
langsmith>=0.1.86
|
|
@@ -107,10 +108,6 @@ openevals>=0.1.0
|
|
|
107
108
|
redis>=5.0.0
|
|
108
109
|
langfuse>=2.0.0
|
|
109
110
|
uuid6>=2025.0.0
|
|
110
|
-
litellm<1.82.0,>=1.81.0
|
|
111
|
-
opentelemetry-api>=1.29.0
|
|
112
|
-
opentelemetry-sdk>=1.29.0
|
|
113
|
-
opentelemetry-exporter-otlp>=1.29.0
|
|
114
111
|
|
|
115
112
|
[pydantic]
|
|
116
113
|
pydantic-ai>=1.0.2
|
|
@@ -31,7 +31,7 @@ dependencies = [
|
|
|
31
31
|
"omegaconf>=2.3.0",
|
|
32
32
|
"httpx>=0.24.0",
|
|
33
33
|
"anthropic>=0.59.0",
|
|
34
|
-
"litellm
|
|
34
|
+
"litellm<1.75.0",
|
|
35
35
|
"pytest>=6.0.0",
|
|
36
36
|
"pytest-asyncio>=0.21.0",
|
|
37
37
|
"peewee>=3.18.2",
|
|
@@ -146,17 +146,13 @@ langgraph = [
|
|
|
146
146
|
langgraph_tools = [
|
|
147
147
|
"langgraph>=0.6.7",
|
|
148
148
|
"langchain>=0.3.0",
|
|
149
|
-
|
|
149
|
+
"langchain-fireworks>=0.3.0",
|
|
150
150
|
]
|
|
151
151
|
|
|
152
152
|
proxy = [
|
|
153
153
|
"redis>=5.0.0",
|
|
154
154
|
"langfuse>=2.0.0",
|
|
155
155
|
"uuid6>=2025.0.0",
|
|
156
|
-
"litellm>=1.81.0,<1.82.0",
|
|
157
|
-
"opentelemetry-api>=1.29.0",
|
|
158
|
-
"opentelemetry-sdk>=1.29.0",
|
|
159
|
-
"opentelemetry-exporter-otlp>=1.29.0",
|
|
160
156
|
]
|
|
161
157
|
|
|
162
158
|
[project.scripts]
|