eval-protocol 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.3/eval_protocol.egg-info → eval_protocol-0.2.5}/PKG-INFO +18 -9
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/README.md +12 -6
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/langfuse.py +120 -135
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli.py +7 -0
- eval_protocol-0.2.5/eval_protocol/cli_commands/logs.py +29 -0
- eval_protocol-0.2.5/eval_protocol/dataset_logger/__init__.py +3 -0
- eval_protocol-0.2.5/eval_protocol/dataset_logger/dataset_logger.py +35 -0
- eval_protocol-0.2.5/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +114 -0
- eval_protocol-0.2.5/eval_protocol/human_id/__init__.py +34 -0
- eval_protocol-0.2.5/eval_protocol/human_id/dictionary.py +507 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/client/connection.py +19 -1
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/manager.py +3 -38
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/mcp_multi_client.py +48 -24
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/mcpgym.py +33 -2
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/session/manager.py +7 -9
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_env.py +28 -12
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/models.py +58 -6
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_agent_rollout_processor.py +48 -34
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +17 -19
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/evaluation_test.py +145 -68
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/types/types.py +4 -18
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/__init__.py +5 -0
- eval_protocol-0.2.5/eval_protocol/utils/logs_server.py +295 -0
- eval_protocol-0.2.5/eval_protocol/utils/vite_server.py +112 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5/eval_protocol.egg-info}/PKG-INFO +18 -9
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/SOURCES.txt +8 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/requires.txt +4 -1
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/pyproject.toml +6 -2
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_parallel_rollouts.py +2 -2
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_rollout_control_plane_integration.py +10 -2
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_url_handling.py +26 -12
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/LICENSE +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/setup.cfg +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/setup.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_config.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_format.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_length.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_math.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_models.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.3 → eval_protocol-0.2.5}/versioneer.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
|
-
License-Expression:
|
|
6
|
+
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: Operating System :: OS Independent
|
|
@@ -13,7 +13,6 @@ License-File: LICENSE
|
|
|
13
13
|
Requires-Dist: requests>=2.25.0
|
|
14
14
|
Requires-Dist: pydantic>=2.0.0
|
|
15
15
|
Requires-Dist: dataclasses-json>=0.5.7
|
|
16
|
-
Requires-Dist: fastapi>=0.68.0
|
|
17
16
|
Requires-Dist: uvicorn>=0.15.0
|
|
18
17
|
Requires-Dist: python-dotenv>=0.19.0
|
|
19
18
|
Requires-Dist: openai==1.78.1
|
|
@@ -39,6 +38,10 @@ Requires-Dist: litellm>=1.0.0
|
|
|
39
38
|
Requires-Dist: addict>=2.4.0
|
|
40
39
|
Requires-Dist: deepdiff>=6.0.0
|
|
41
40
|
Requires-Dist: pandas>=1.5.0
|
|
41
|
+
Requires-Dist: watchdog>=2.1.0
|
|
42
|
+
Requires-Dist: websockets>=15.0.1
|
|
43
|
+
Requires-Dist: fireworks-ai>=0.19.12
|
|
44
|
+
Requires-Dist: fastapi>=0.116.1
|
|
42
45
|
Provides-Extra: dev
|
|
43
46
|
Requires-Dist: build; extra == "dev"
|
|
44
47
|
Requires-Dist: twine; extra == "dev"
|
|
@@ -96,8 +99,8 @@ Dynamic: license-file
|
|
|
96
99
|
|
|
97
100
|
[](https://pypi.org/project/eval-protocol/)
|
|
98
101
|
|
|
99
|
-
EP is an open specification, Python SDK,
|
|
100
|
-
standardized way to write evaluations for large language model (LLM)
|
|
102
|
+
EP is an open specification, Python SDK, pytest wrapper, and suite of tools that
|
|
103
|
+
provides a standardized way to write evaluations for large language model (LLM)
|
|
101
104
|
applications. Start with simple single-turn evals for model selection and prompt
|
|
102
105
|
engineering, then scale up to complex multi-turn reinforcement learning (RL) for
|
|
103
106
|
agents using Model Context Protocol (MCP). EP ensures consistent patterns for
|
|
@@ -106,6 +109,12 @@ sophisticated agent evaluations that work across real-world scenarios, from
|
|
|
106
109
|
markdown generation tasks to customer service agents with tool calling
|
|
107
110
|
capabilities.
|
|
108
111
|
|
|
112
|
+
<p align="center">
|
|
113
|
+
<img src="./assets/ui.png" alt="UI" />
|
|
114
|
+
<br>
|
|
115
|
+
<sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
|
|
116
|
+
</p>
|
|
117
|
+
|
|
109
118
|
## Quick Example
|
|
110
119
|
|
|
111
120
|
Here's a simple test function that checks if a model's response contains **bold** text formatting:
|
|
@@ -129,17 +138,17 @@ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
|
|
|
129
138
|
"""
|
|
130
139
|
Simple evaluation that checks if the model's response contains bold text.
|
|
131
140
|
"""
|
|
132
|
-
|
|
141
|
+
|
|
133
142
|
assistant_response = row.messages[-1].content
|
|
134
|
-
|
|
143
|
+
|
|
135
144
|
# Check if response contains **bold** text
|
|
136
145
|
has_bold = "**" in assistant_response
|
|
137
|
-
|
|
146
|
+
|
|
138
147
|
if has_bold:
|
|
139
148
|
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
|
|
140
149
|
else:
|
|
141
150
|
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
|
|
142
|
-
|
|
151
|
+
|
|
143
152
|
row.evaluation_result = result
|
|
144
153
|
return row
|
|
145
154
|
```
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/eval-protocol/)
|
|
4
4
|
|
|
5
|
-
EP is an open specification, Python SDK,
|
|
6
|
-
standardized way to write evaluations for large language model (LLM)
|
|
5
|
+
EP is an open specification, Python SDK, pytest wrapper, and suite of tools that
|
|
6
|
+
provides a standardized way to write evaluations for large language model (LLM)
|
|
7
7
|
applications. Start with simple single-turn evals for model selection and prompt
|
|
8
8
|
engineering, then scale up to complex multi-turn reinforcement learning (RL) for
|
|
9
9
|
agents using Model Context Protocol (MCP). EP ensures consistent patterns for
|
|
@@ -12,6 +12,12 @@ sophisticated agent evaluations that work across real-world scenarios, from
|
|
|
12
12
|
markdown generation tasks to customer service agents with tool calling
|
|
13
13
|
capabilities.
|
|
14
14
|
|
|
15
|
+
<p align="center">
|
|
16
|
+
<img src="./assets/ui.png" alt="UI" />
|
|
17
|
+
<br>
|
|
18
|
+
<sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
15
21
|
## Quick Example
|
|
16
22
|
|
|
17
23
|
Here's a simple test function that checks if a model's response contains **bold** text formatting:
|
|
@@ -35,17 +41,17 @@ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
|
|
|
35
41
|
"""
|
|
36
42
|
Simple evaluation that checks if the model's response contains bold text.
|
|
37
43
|
"""
|
|
38
|
-
|
|
44
|
+
|
|
39
45
|
assistant_response = row.messages[-1].content
|
|
40
|
-
|
|
46
|
+
|
|
41
47
|
# Check if response contains **bold** text
|
|
42
48
|
has_bold = "**" in assistant_response
|
|
43
|
-
|
|
49
|
+
|
|
44
50
|
if has_bold:
|
|
45
51
|
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
|
|
46
52
|
else:
|
|
47
53
|
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
|
|
48
|
-
|
|
54
|
+
|
|
49
55
|
row.evaluation_result = result
|
|
50
56
|
return row
|
|
51
57
|
```
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-06T01:34:18-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "1a37ee141ebe4084654889ace2aba9c1529acf1c",
|
|
15
|
+
"version": "0.2.5"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -14,21 +14,19 @@ logger = logging.getLogger(__name__)
|
|
|
14
14
|
|
|
15
15
|
try:
|
|
16
16
|
from langfuse import Langfuse
|
|
17
|
+
|
|
17
18
|
LANGFUSE_AVAILABLE = True
|
|
18
19
|
except ImportError:
|
|
19
20
|
LANGFUSE_AVAILABLE = False
|
|
20
|
-
logger.warning(
|
|
21
|
-
"Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'"
|
|
22
|
-
)
|
|
23
21
|
|
|
24
22
|
|
|
25
23
|
class LangfuseAdapter:
|
|
26
24
|
"""Adapter to pull data from Langfuse and convert to EvaluationRow format.
|
|
27
|
-
|
|
25
|
+
|
|
28
26
|
This adapter can pull both chat conversations and tool calling traces from
|
|
29
27
|
Langfuse deployments and convert them into the EvaluationRow format expected
|
|
30
28
|
by the evaluation protocol.
|
|
31
|
-
|
|
29
|
+
|
|
32
30
|
Examples:
|
|
33
31
|
Basic usage:
|
|
34
32
|
>>> adapter = LangfuseAdapter(
|
|
@@ -37,7 +35,7 @@ class LangfuseAdapter:
|
|
|
37
35
|
... host="https://your-langfuse-deployment.com"
|
|
38
36
|
... )
|
|
39
37
|
>>> rows = list(adapter.get_evaluation_rows(limit=10))
|
|
40
|
-
|
|
38
|
+
|
|
41
39
|
Filter by specific criteria:
|
|
42
40
|
>>> rows = list(adapter.get_evaluation_rows(
|
|
43
41
|
... limit=50,
|
|
@@ -46,7 +44,7 @@ class LangfuseAdapter:
|
|
|
46
44
|
... from_timestamp=datetime.now() - timedelta(days=7)
|
|
47
45
|
... ))
|
|
48
46
|
"""
|
|
49
|
-
|
|
47
|
+
|
|
50
48
|
def __init__(
|
|
51
49
|
self,
|
|
52
50
|
public_key: str,
|
|
@@ -55,25 +53,19 @@ class LangfuseAdapter:
|
|
|
55
53
|
project_id: Optional[str] = None,
|
|
56
54
|
):
|
|
57
55
|
"""Initialize the Langfuse adapter.
|
|
58
|
-
|
|
56
|
+
|
|
59
57
|
Args:
|
|
60
58
|
public_key: Langfuse public key
|
|
61
|
-
secret_key: Langfuse secret key
|
|
59
|
+
secret_key: Langfuse secret key
|
|
62
60
|
host: Langfuse host URL (default: https://cloud.langfuse.com)
|
|
63
61
|
project_id: Optional project ID to filter traces
|
|
64
62
|
"""
|
|
65
63
|
if not LANGFUSE_AVAILABLE:
|
|
66
|
-
raise ImportError(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
self.client = Langfuse(
|
|
71
|
-
public_key=public_key,
|
|
72
|
-
secret_key=secret_key,
|
|
73
|
-
host=host
|
|
74
|
-
)
|
|
64
|
+
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
|
|
65
|
+
|
|
66
|
+
self.client = Langfuse(public_key=public_key, secret_key=secret_key, host=host)
|
|
75
67
|
self.project_id = project_id
|
|
76
|
-
|
|
68
|
+
|
|
77
69
|
def get_evaluation_rows(
|
|
78
70
|
self,
|
|
79
71
|
limit: int = 100,
|
|
@@ -85,16 +77,16 @@ class LangfuseAdapter:
|
|
|
85
77
|
include_tool_calls: bool = True,
|
|
86
78
|
) -> Iterator[EvaluationRow]:
|
|
87
79
|
"""Pull traces from Langfuse and convert to EvaluationRow format.
|
|
88
|
-
|
|
80
|
+
|
|
89
81
|
Args:
|
|
90
82
|
limit: Maximum number of rows to return
|
|
91
83
|
tags: Filter by specific tags
|
|
92
84
|
user_id: Filter by user ID
|
|
93
|
-
session_id: Filter by session ID
|
|
85
|
+
session_id: Filter by session ID
|
|
94
86
|
from_timestamp: Filter traces after this timestamp
|
|
95
87
|
to_timestamp: Filter traces before this timestamp
|
|
96
88
|
include_tool_calls: Whether to include tool calling traces
|
|
97
|
-
|
|
89
|
+
|
|
98
90
|
Yields:
|
|
99
91
|
EvaluationRow: Converted evaluation rows
|
|
100
92
|
"""
|
|
@@ -102,12 +94,12 @@ class LangfuseAdapter:
|
|
|
102
94
|
traces = self.client.get_traces(
|
|
103
95
|
limit=limit,
|
|
104
96
|
tags=tags,
|
|
105
|
-
user_id=user_id,
|
|
97
|
+
user_id=user_id,
|
|
106
98
|
session_id=session_id,
|
|
107
99
|
from_timestamp=from_timestamp,
|
|
108
|
-
to_timestamp=to_timestamp
|
|
100
|
+
to_timestamp=to_timestamp,
|
|
109
101
|
)
|
|
110
|
-
|
|
102
|
+
|
|
111
103
|
for trace in traces.data:
|
|
112
104
|
try:
|
|
113
105
|
eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
|
|
@@ -116,18 +108,18 @@ class LangfuseAdapter:
|
|
|
116
108
|
except (AttributeError, ValueError, KeyError) as e:
|
|
117
109
|
logger.warning("Failed to convert trace %s: %s", trace.id, e)
|
|
118
110
|
continue
|
|
119
|
-
|
|
111
|
+
|
|
120
112
|
def get_evaluation_rows_by_ids(
|
|
121
|
-
self,
|
|
113
|
+
self,
|
|
122
114
|
trace_ids: List[str],
|
|
123
115
|
include_tool_calls: bool = True,
|
|
124
116
|
) -> Iterator[EvaluationRow]:
|
|
125
117
|
"""Get specific traces by their IDs and convert to EvaluationRow format.
|
|
126
|
-
|
|
118
|
+
|
|
127
119
|
Args:
|
|
128
120
|
trace_ids: List of trace IDs to fetch
|
|
129
121
|
include_tool_calls: Whether to include tool calling traces
|
|
130
|
-
|
|
122
|
+
|
|
131
123
|
Yields:
|
|
132
124
|
EvaluationRow: Converted evaluation rows
|
|
133
125
|
"""
|
|
@@ -140,137 +132,131 @@ class LangfuseAdapter:
|
|
|
140
132
|
except (AttributeError, ValueError, KeyError) as e:
|
|
141
133
|
logger.warning("Failed to fetch/convert trace %s: %s", trace_id, e)
|
|
142
134
|
continue
|
|
143
|
-
|
|
144
|
-
def _convert_trace_to_evaluation_row(
|
|
145
|
-
self,
|
|
146
|
-
trace: Any,
|
|
147
|
-
include_tool_calls: bool = True
|
|
148
|
-
) -> Optional[EvaluationRow]:
|
|
135
|
+
|
|
136
|
+
def _convert_trace_to_evaluation_row(self, trace: Any, include_tool_calls: bool = True) -> Optional[EvaluationRow]:
|
|
149
137
|
"""Convert a Langfuse trace to EvaluationRow format.
|
|
150
|
-
|
|
138
|
+
|
|
151
139
|
Args:
|
|
152
140
|
trace: Langfuse trace object
|
|
153
141
|
include_tool_calls: Whether to include tool calling information
|
|
154
|
-
|
|
142
|
+
|
|
155
143
|
Returns:
|
|
156
144
|
EvaluationRow or None if conversion fails
|
|
157
145
|
"""
|
|
158
146
|
try:
|
|
159
147
|
# Get observations (generations, spans) from the trace
|
|
160
148
|
observations = self.client.get_observations(trace_id=trace.id).data
|
|
161
|
-
|
|
149
|
+
|
|
162
150
|
# Convert observations to messages
|
|
163
151
|
messages = self._extract_messages_from_observations(observations, include_tool_calls)
|
|
164
|
-
|
|
152
|
+
|
|
165
153
|
if not messages:
|
|
166
154
|
return None
|
|
167
|
-
|
|
155
|
+
|
|
168
156
|
# Extract metadata
|
|
169
157
|
input_metadata = self._create_input_metadata(trace, observations)
|
|
170
|
-
|
|
158
|
+
|
|
171
159
|
# Extract ground truth if available (from trace metadata or tags)
|
|
172
160
|
ground_truth = self._extract_ground_truth(trace)
|
|
173
|
-
|
|
161
|
+
|
|
174
162
|
# Extract tools if available
|
|
175
163
|
tools = self._extract_tools(observations) if include_tool_calls else None
|
|
176
|
-
|
|
164
|
+
|
|
177
165
|
return EvaluationRow(
|
|
178
166
|
messages=messages,
|
|
179
167
|
tools=tools,
|
|
180
168
|
input_metadata=input_metadata,
|
|
181
169
|
ground_truth=ground_truth,
|
|
182
170
|
)
|
|
183
|
-
|
|
171
|
+
|
|
184
172
|
except (AttributeError, ValueError, KeyError) as e:
|
|
185
173
|
logger.error("Error converting trace %s: %s", trace.id, e)
|
|
186
174
|
return None
|
|
187
|
-
|
|
175
|
+
|
|
188
176
|
def _extract_messages_from_observations(
|
|
189
|
-
self,
|
|
190
|
-
observations: List[Any],
|
|
191
|
-
include_tool_calls: bool = True
|
|
177
|
+
self, observations: List[Any], include_tool_calls: bool = True
|
|
192
178
|
) -> List[Message]:
|
|
193
179
|
"""Extract messages from Langfuse observations.
|
|
194
|
-
|
|
180
|
+
|
|
195
181
|
Args:
|
|
196
182
|
observations: List of Langfuse observation objects
|
|
197
183
|
include_tool_calls: Whether to include tool calling information
|
|
198
|
-
|
|
184
|
+
|
|
199
185
|
Returns:
|
|
200
186
|
List of Message objects
|
|
201
187
|
"""
|
|
202
188
|
messages = []
|
|
203
|
-
|
|
189
|
+
|
|
204
190
|
# Sort observations by timestamp
|
|
205
191
|
sorted_observations = sorted(observations, key=lambda x: x.start_time or datetime.min)
|
|
206
|
-
|
|
192
|
+
|
|
207
193
|
for obs in sorted_observations:
|
|
208
194
|
try:
|
|
209
|
-
if hasattr(obs,
|
|
195
|
+
if hasattr(obs, "input") and obs.input:
|
|
210
196
|
# Handle different input formats
|
|
211
197
|
if isinstance(obs.input, dict):
|
|
212
|
-
if
|
|
198
|
+
if "messages" in obs.input:
|
|
213
199
|
# OpenAI-style messages format
|
|
214
|
-
for msg in obs.input[
|
|
200
|
+
for msg in obs.input["messages"]:
|
|
215
201
|
messages.append(self._dict_to_message(msg, include_tool_calls))
|
|
216
|
-
elif
|
|
202
|
+
elif "role" in obs.input:
|
|
217
203
|
# Single message format
|
|
218
204
|
messages.append(self._dict_to_message(obs.input, include_tool_calls))
|
|
219
|
-
elif
|
|
205
|
+
elif "prompt" in obs.input:
|
|
220
206
|
# Simple prompt format
|
|
221
|
-
messages.append(Message(role="user", content=str(obs.input[
|
|
207
|
+
messages.append(Message(role="user", content=str(obs.input["prompt"])))
|
|
222
208
|
elif isinstance(obs.input, str):
|
|
223
209
|
# Simple string input
|
|
224
210
|
messages.append(Message(role="user", content=obs.input))
|
|
225
|
-
|
|
226
|
-
if hasattr(obs,
|
|
211
|
+
|
|
212
|
+
if hasattr(obs, "output") and obs.output:
|
|
227
213
|
# Handle output
|
|
228
214
|
if isinstance(obs.output, dict):
|
|
229
|
-
if
|
|
230
|
-
messages.append(Message(role="assistant", content=str(obs.output[
|
|
231
|
-
elif
|
|
232
|
-
msg_dict = obs.output[
|
|
215
|
+
if "content" in obs.output:
|
|
216
|
+
messages.append(Message(role="assistant", content=str(obs.output["content"])))
|
|
217
|
+
elif "message" in obs.output:
|
|
218
|
+
msg_dict = obs.output["message"]
|
|
233
219
|
messages.append(self._dict_to_message(msg_dict, include_tool_calls))
|
|
234
220
|
else:
|
|
235
221
|
# Fallback: convert entire output to string
|
|
236
222
|
messages.append(Message(role="assistant", content=str(obs.output)))
|
|
237
223
|
elif isinstance(obs.output, str):
|
|
238
224
|
messages.append(Message(role="assistant", content=obs.output))
|
|
239
|
-
|
|
225
|
+
|
|
240
226
|
except (AttributeError, ValueError, KeyError) as e:
|
|
241
227
|
logger.warning("Error processing observation %s: %s", obs.id, e)
|
|
242
228
|
continue
|
|
243
|
-
|
|
229
|
+
|
|
244
230
|
return messages
|
|
245
|
-
|
|
231
|
+
|
|
246
232
|
def _dict_to_message(self, msg_dict: Dict[str, Any], include_tool_calls: bool = True) -> Message:
|
|
247
233
|
"""Convert a dictionary to a Message object.
|
|
248
|
-
|
|
234
|
+
|
|
249
235
|
Args:
|
|
250
236
|
msg_dict: Dictionary containing message data
|
|
251
237
|
include_tool_calls: Whether to include tool calling information
|
|
252
|
-
|
|
238
|
+
|
|
253
239
|
Returns:
|
|
254
240
|
Message object
|
|
255
241
|
"""
|
|
256
242
|
# Extract basic message components
|
|
257
|
-
role = msg_dict.get(
|
|
258
|
-
content = msg_dict.get(
|
|
259
|
-
name = msg_dict.get(
|
|
260
|
-
|
|
243
|
+
role = msg_dict.get("role", "assistant")
|
|
244
|
+
content = msg_dict.get("content")
|
|
245
|
+
name = msg_dict.get("name")
|
|
246
|
+
|
|
261
247
|
# Handle tool calls if enabled
|
|
262
248
|
tool_calls = None
|
|
263
249
|
tool_call_id = None
|
|
264
250
|
function_call = None
|
|
265
|
-
|
|
251
|
+
|
|
266
252
|
if include_tool_calls:
|
|
267
|
-
if
|
|
268
|
-
tool_calls = msg_dict[
|
|
269
|
-
if
|
|
270
|
-
tool_call_id = msg_dict[
|
|
271
|
-
if
|
|
272
|
-
function_call = msg_dict[
|
|
273
|
-
|
|
253
|
+
if "tool_calls" in msg_dict:
|
|
254
|
+
tool_calls = msg_dict["tool_calls"]
|
|
255
|
+
if "tool_call_id" in msg_dict:
|
|
256
|
+
tool_call_id = msg_dict["tool_call_id"]
|
|
257
|
+
if "function_call" in msg_dict:
|
|
258
|
+
function_call = msg_dict["function_call"]
|
|
259
|
+
|
|
274
260
|
return Message(
|
|
275
261
|
role=role,
|
|
276
262
|
content=content,
|
|
@@ -279,106 +265,105 @@ class LangfuseAdapter:
|
|
|
279
265
|
tool_calls=tool_calls,
|
|
280
266
|
function_call=function_call,
|
|
281
267
|
)
|
|
282
|
-
|
|
268
|
+
|
|
283
269
|
def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMetadata:
|
|
284
270
|
"""Create InputMetadata from trace and observations.
|
|
285
|
-
|
|
271
|
+
|
|
286
272
|
Args:
|
|
287
273
|
trace: Langfuse trace object
|
|
288
274
|
observations: List of observation objects
|
|
289
|
-
|
|
275
|
+
|
|
290
276
|
Returns:
|
|
291
277
|
InputMetadata object
|
|
292
278
|
"""
|
|
293
279
|
# Extract completion parameters from observations
|
|
294
280
|
completion_params = CompletionParams()
|
|
295
|
-
|
|
281
|
+
|
|
296
282
|
# Look for model parameters in observations
|
|
297
283
|
for obs in observations:
|
|
298
|
-
if hasattr(obs,
|
|
284
|
+
if hasattr(obs, "model") and obs.model:
|
|
299
285
|
completion_params.model = obs.model
|
|
300
|
-
if hasattr(obs,
|
|
286
|
+
if hasattr(obs, "model_parameters") and obs.model_parameters:
|
|
301
287
|
params = obs.model_parameters
|
|
302
|
-
if
|
|
303
|
-
completion_params.temperature = params[
|
|
304
|
-
if
|
|
305
|
-
completion_params.max_tokens = params[
|
|
306
|
-
if
|
|
307
|
-
completion_params.top_p = params[
|
|
288
|
+
if "temperature" in params:
|
|
289
|
+
completion_params.temperature = params["temperature"]
|
|
290
|
+
if "max_tokens" in params:
|
|
291
|
+
completion_params.max_tokens = params["max_tokens"]
|
|
292
|
+
if "top_p" in params:
|
|
293
|
+
completion_params.top_p = params["top_p"]
|
|
308
294
|
break
|
|
309
|
-
|
|
295
|
+
|
|
310
296
|
# Create dataset info from trace metadata
|
|
311
297
|
dataset_info = {
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
298
|
+
"trace_id": trace.id,
|
|
299
|
+
"trace_name": getattr(trace, "name", None),
|
|
300
|
+
"trace_tags": getattr(trace, "tags", []),
|
|
301
|
+
"langfuse_project_id": self.project_id,
|
|
316
302
|
}
|
|
317
|
-
|
|
303
|
+
|
|
318
304
|
# Add trace metadata if available
|
|
319
|
-
if hasattr(trace,
|
|
320
|
-
dataset_info[
|
|
321
|
-
|
|
305
|
+
if hasattr(trace, "metadata") and trace.metadata:
|
|
306
|
+
dataset_info["trace_metadata"] = trace.metadata
|
|
307
|
+
|
|
322
308
|
# Create session data
|
|
323
309
|
session_data = {
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
310
|
+
"session_id": getattr(trace, "session_id", None),
|
|
311
|
+
"user_id": getattr(trace, "user_id", None),
|
|
312
|
+
"timestamp": getattr(trace, "timestamp", None),
|
|
313
|
+
"langfuse_trace_url": (
|
|
314
|
+
f"{self.client.host}/project/{self.project_id}/traces/{trace.id}" if self.project_id else None
|
|
315
|
+
),
|
|
328
316
|
}
|
|
329
|
-
|
|
317
|
+
|
|
330
318
|
return InputMetadata(
|
|
331
319
|
row_id=trace.id,
|
|
332
320
|
completion_params=completion_params,
|
|
333
321
|
dataset_info=dataset_info,
|
|
334
322
|
session_data=session_data,
|
|
335
323
|
)
|
|
336
|
-
|
|
324
|
+
|
|
337
325
|
def _extract_ground_truth(self, trace: Any) -> Optional[str]:
|
|
338
326
|
"""Extract ground truth from trace if available.
|
|
339
|
-
|
|
327
|
+
|
|
340
328
|
Args:
|
|
341
329
|
trace: Langfuse trace object
|
|
342
|
-
|
|
330
|
+
|
|
343
331
|
Returns:
|
|
344
332
|
Ground truth string or None
|
|
345
333
|
"""
|
|
346
334
|
# Check trace metadata for ground truth
|
|
347
|
-
if hasattr(trace,
|
|
335
|
+
if hasattr(trace, "metadata") and trace.metadata:
|
|
348
336
|
if isinstance(trace.metadata, dict):
|
|
349
|
-
return trace.metadata.get(
|
|
350
|
-
|
|
337
|
+
return trace.metadata.get("ground_truth") or trace.metadata.get("expected_answer")
|
|
338
|
+
|
|
351
339
|
# Check tags for ground truth indicators
|
|
352
|
-
if hasattr(trace,
|
|
340
|
+
if hasattr(trace, "tags") and trace.tags:
|
|
353
341
|
for tag in trace.tags:
|
|
354
|
-
if tag.startswith(
|
|
355
|
-
return tag.replace(
|
|
356
|
-
|
|
342
|
+
if tag.startswith("ground_truth:"):
|
|
343
|
+
return tag.replace("ground_truth:", "", 1)
|
|
344
|
+
|
|
357
345
|
return None
|
|
358
|
-
|
|
346
|
+
|
|
359
347
|
def _extract_tools(self, observations: List[Any]) -> Optional[List[Dict[str, Any]]]:
|
|
360
348
|
"""Extract tool definitions from observations.
|
|
361
|
-
|
|
349
|
+
|
|
362
350
|
Args:
|
|
363
351
|
observations: List of observation objects
|
|
364
|
-
|
|
352
|
+
|
|
365
353
|
Returns:
|
|
366
354
|
List of tool definitions or None
|
|
367
355
|
"""
|
|
368
356
|
tools = []
|
|
369
|
-
|
|
357
|
+
|
|
370
358
|
for obs in observations:
|
|
371
|
-
if hasattr(obs,
|
|
372
|
-
if
|
|
373
|
-
tools.extend(obs.input[
|
|
374
|
-
elif
|
|
359
|
+
if hasattr(obs, "input") and obs.input and isinstance(obs.input, dict):
|
|
360
|
+
if "tools" in obs.input:
|
|
361
|
+
tools.extend(obs.input["tools"])
|
|
362
|
+
elif "functions" in obs.input:
|
|
375
363
|
# Convert functions to tools format
|
|
376
|
-
for func in obs.input[
|
|
377
|
-
tools.append({
|
|
378
|
-
|
|
379
|
-
'function': func
|
|
380
|
-
})
|
|
381
|
-
|
|
364
|
+
for func in obs.input["functions"]:
|
|
365
|
+
tools.append({"type": "function", "function": func})
|
|
366
|
+
|
|
382
367
|
return tools if tools else None
|
|
383
368
|
|
|
384
369
|
|
|
@@ -389,19 +374,19 @@ def create_langfuse_adapter(
|
|
|
389
374
|
project_id: Optional[str] = None,
|
|
390
375
|
) -> LangfuseAdapter:
|
|
391
376
|
"""Factory function to create a Langfuse adapter.
|
|
392
|
-
|
|
377
|
+
|
|
393
378
|
Args:
|
|
394
379
|
public_key: Langfuse public key
|
|
395
380
|
secret_key: Langfuse secret key
|
|
396
381
|
host: Langfuse host URL
|
|
397
382
|
project_id: Optional project ID
|
|
398
|
-
|
|
383
|
+
|
|
399
384
|
Returns:
|
|
400
385
|
LangfuseAdapter instance
|
|
401
386
|
"""
|
|
402
387
|
return LangfuseAdapter(
|
|
403
388
|
public_key=public_key,
|
|
404
|
-
secret_key=secret_key,
|
|
389
|
+
secret_key=secret_key,
|
|
405
390
|
host=host,
|
|
406
391
|
project_id=project_id,
|
|
407
|
-
)
|
|
392
|
+
)
|
|
@@ -14,6 +14,7 @@ from pathlib import Path
|
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
from eval_protocol.evaluation import create_evaluation, preview_evaluation
|
|
18
19
|
|
|
19
20
|
from .cli_commands.agent_eval_cmd import agent_eval_command
|
|
@@ -24,6 +25,7 @@ from .cli_commands.common import (
|
|
|
24
25
|
)
|
|
25
26
|
from .cli_commands.deploy import deploy_command
|
|
26
27
|
from .cli_commands.deploy_mcp import deploy_mcp_command
|
|
28
|
+
from .cli_commands.logs import logs_command
|
|
27
29
|
from .cli_commands.preview import preview_command
|
|
28
30
|
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
|
|
29
31
|
|
|
@@ -285,6 +287,9 @@ def parse_args(args=None):
|
|
|
285
287
|
help="Override the number of parallel rollouts to execute for each task.",
|
|
286
288
|
)
|
|
287
289
|
|
|
290
|
+
# Logs command
|
|
291
|
+
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
292
|
+
|
|
288
293
|
# Run command (for Hydra-based evaluations)
|
|
289
294
|
# This subparser intentionally defines no arguments itself.
|
|
290
295
|
# All arguments after 'run' will be passed to Hydra by parse_known_args.
|
|
@@ -338,6 +343,8 @@ def main():
|
|
|
338
343
|
return deploy_mcp_command(args)
|
|
339
344
|
elif args.command == "agent-eval":
|
|
340
345
|
return agent_eval_command(args)
|
|
346
|
+
elif args.command == "logs":
|
|
347
|
+
return logs_command(args)
|
|
341
348
|
elif args.command == "run":
|
|
342
349
|
# For the 'run' command, Hydra takes over argument parsing.
|
|
343
350
|
|