eval-protocol 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.0/eval_protocol.egg-info → eval_protocol-0.2.1}/PKG-INFO +10 -1
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/_version.py +3 -3
- eval_protocol-0.2.1/eval_protocol/adapters/__init__.py +47 -0
- eval_protocol-0.2.1/eval_protocol/adapters/huggingface.py +444 -0
- eval_protocol-0.2.1/eval_protocol/adapters/langfuse.py +407 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/__init__.py +2 -0
- eval_protocol-0.2.1/eval_protocol/pytest/default_dataset_adapter.py +10 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_single_turn_rollout_process.py +12 -4
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/evaluation_test.py +7 -7
- {eval_protocol-0.2.0 → eval_protocol-0.2.1/eval_protocol.egg-info}/PKG-INFO +10 -1
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/SOURCES.txt +4 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/requires.txt +12 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/pyproject.toml +12 -0
- eval_protocol-0.2.1/tests/test_adapters_e2e.py +447 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli_agent.py +4 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_examples_end_to_end.py +4 -4
- eval_protocol-0.2.0/eval_protocol/adapters/__init__.py +0 -1
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/LICENSE +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/README.md +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/setup.cfg +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/setup.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_config.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_format.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_length.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_math.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_models.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.0 → eval_protocol-0.2.1}/versioneer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -81,6 +81,15 @@ Provides-Extra: box2d
|
|
|
81
81
|
Requires-Dist: swig; extra == "box2d"
|
|
82
82
|
Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
|
|
83
83
|
Requires-Dist: Pillow; extra == "box2d"
|
|
84
|
+
Provides-Extra: langfuse
|
|
85
|
+
Requires-Dist: langfuse>=2.0.0; extra == "langfuse"
|
|
86
|
+
Provides-Extra: huggingface
|
|
87
|
+
Requires-Dist: datasets>=2.0.0; extra == "huggingface"
|
|
88
|
+
Requires-Dist: transformers>=4.0.0; extra == "huggingface"
|
|
89
|
+
Provides-Extra: adapters
|
|
90
|
+
Requires-Dist: langfuse>=2.0.0; extra == "adapters"
|
|
91
|
+
Requires-Dist: datasets>=2.0.0; extra == "adapters"
|
|
92
|
+
Requires-Dist: transformers>=4.0.0; extra == "adapters"
|
|
84
93
|
Dynamic: license-file
|
|
85
94
|
|
|
86
95
|
# Eval Protocol (EP)
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-04T14:28:02-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.0"
|
|
14
|
+
"full-revisionid": "07fda02490d1a09c7ab92595d6397622cb64230d",
|
|
15
|
+
"version": "0.2.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Data source adapters for Eval Protocol.
|
|
2
|
+
|
|
3
|
+
This package provides adapters for integrating with various data sources
|
|
4
|
+
and converting them to EvaluationRow format for use in evaluation pipelines.
|
|
5
|
+
|
|
6
|
+
Available adapters:
|
|
7
|
+
- LangfuseAdapter: Pull data from Langfuse deployments
|
|
8
|
+
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
|
|
9
|
+
- Braintrust integration (legacy)
|
|
10
|
+
- TRL integration (legacy)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
# Conditional imports based on available dependencies
|
|
14
|
+
try:
|
|
15
|
+
from .langfuse import LangfuseAdapter, create_langfuse_adapter
|
|
16
|
+
__all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
|
|
17
|
+
except ImportError:
|
|
18
|
+
__all__ = []
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from .huggingface import (
|
|
22
|
+
HuggingFaceAdapter,
|
|
23
|
+
create_huggingface_adapter,
|
|
24
|
+
create_gsm8k_adapter,
|
|
25
|
+
create_math_adapter,
|
|
26
|
+
)
|
|
27
|
+
__all__.extend([
|
|
28
|
+
"HuggingFaceAdapter",
|
|
29
|
+
"create_huggingface_adapter",
|
|
30
|
+
"create_gsm8k_adapter",
|
|
31
|
+
"create_math_adapter",
|
|
32
|
+
])
|
|
33
|
+
except ImportError:
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
# Legacy adapters (always available)
|
|
37
|
+
try:
|
|
38
|
+
from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
|
|
39
|
+
__all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
|
|
40
|
+
except ImportError:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
try:
|
|
44
|
+
from .trl import create_trl_adapter
|
|
45
|
+
__all__.extend(["create_trl_adapter"])
|
|
46
|
+
except ImportError:
|
|
47
|
+
pass
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""HuggingFace Datasets adapter for Eval Protocol.
|
|
2
|
+
|
|
3
|
+
This adapter allows loading datasets from HuggingFace Hub with arbitrary
|
|
4
|
+
transformation functions to convert them to EvaluationRow format.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Callable, Dict, Iterator, List, Optional
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from datasets import load_dataset, Dataset, DatasetDict
|
|
16
|
+
DATASETS_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
DATASETS_AVAILABLE = False
|
|
19
|
+
logger.warning(
|
|
20
|
+
"HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Type alias for transformation function
|
|
24
|
+
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HuggingFaceAdapter:
    """Generic adapter to load HuggingFace datasets with custom transformations.

    This adapter loads datasets from HuggingFace Hub and applies a user-provided
    transformation function to convert each row to the format expected by
    EvaluationRow.

    The transformation function should take a dataset row dictionary and return:
    {
        'messages': List[Dict] - list of message dictionaries with 'role' and 'content'
        'ground_truth': Optional[str] - expected answer/output
        'metadata': Optional[Dict] - any additional metadata to preserve
        'tools': Optional[List[Dict]] - tool definitions for tool calling scenarios
    }

    Examples:
        Simple Q&A dataset:
        >>> def transform(row):
        ...     return {
        ...         'messages': [{'role': 'user', 'content': row['question']}],
        ...         'ground_truth': row['answer'],
        ...         'metadata': {'category': row.get('category')}
        ...     }
        >>> adapter = HuggingFaceAdapter("my-dataset", transform_fn=transform)
        >>> rows = list(adapter.get_evaluation_rows(split="test", limit=10))

        Math problems with system prompt:
        >>> def gsm8k_transform(row):
        ...     return {
        ...         'messages': [
        ...             {'role': 'system', 'content': 'Solve step by step.'},
        ...             {'role': 'user', 'content': row['question']}
        ...         ],
        ...         'ground_truth': row['answer'],
        ...         'metadata': {'dataset': 'gsm8k'}
        ...     }
        >>> adapter = HuggingFaceAdapter("gsm8k", config_name="main", transform_fn=gsm8k_transform)
    """

    def __init__(
        self,
        dataset_id: str,
        transform_fn: TransformFunction,
        config_name: Optional[str] = None,
        revision: Optional[str] = None,
        **load_dataset_kwargs,
    ):
        """Initialize the HuggingFace adapter and eagerly load the dataset.

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., "gsm8k", "squad", "org/dataset")
            transform_fn: Function to transform dataset rows to evaluation format
            config_name: Optional dataset configuration name
            revision: Optional dataset revision/commit hash
            **load_dataset_kwargs: Additional arguments to pass to load_dataset

        Raises:
            ImportError: If the optional ``datasets`` dependency is not installed.
        """
        if not DATASETS_AVAILABLE:
            raise ImportError(
                "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
            )

        self.dataset_id = dataset_id
        self.transform_fn = transform_fn
        self.config_name = config_name
        self.revision = revision
        self.load_dataset_kwargs = load_dataset_kwargs

        # Load the dataset
        self.dataset = self._load_dataset()

    @classmethod
    def from_local(
        cls,
        path: str,
        transform_fn: TransformFunction,
        **load_dataset_kwargs,
    ) -> "HuggingFaceAdapter":
        """Create adapter from local dataset file.

        The builder is chosen from the file suffix (matched case-insensitively).
        Unrecognised suffixes fall back to the "json" builder.

        Args:
            path: Path to local dataset file (JSON, JSONL, CSV, etc.)
            transform_fn: Function to transform dataset rows
            **load_dataset_kwargs: Additional arguments to pass to load_dataset;
                a caller-supplied ``data_files`` takes precedence over ``path``.

        Returns:
            HuggingFaceAdapter instance
        """
        # Suffix -> builder name passed to load_dataset as the dataset id.
        suffix_to_builder = (
            ('.jsonl', 'json'),
            ('.json', 'json'),
            ('.csv', 'csv'),
            ('.parquet', 'parquet'),
        )
        lowered_path = path.lower()
        dataset_type = next(
            (builder for suffix, builder in suffix_to_builder if lowered_path.endswith(suffix)),
            "json",  # unknown extension: default to the json builder
        )

        load_kwargs = {'data_files': path, **load_dataset_kwargs}

        return cls(
            dataset_id=dataset_type,
            transform_fn=transform_fn,
            **load_kwargs
        )

    def _load_dataset(self) -> "Dataset | DatasetDict":
        """Load the dataset from HuggingFace Hub or local source.

        ``config_name`` and ``revision`` are forwarded only when set; any extra
        ``load_dataset_kwargs`` are applied last and therefore win on conflict.

        Raises:
            OSError, ValueError, RuntimeError: Logged and re-raised unchanged.
        """
        try:
            kwargs = {}
            if self.config_name:
                kwargs['name'] = self.config_name
            if self.revision:
                kwargs['revision'] = self.revision

            kwargs.update(self.load_dataset_kwargs)

            return load_dataset(self.dataset_id, **kwargs)

        except (OSError, ValueError, RuntimeError) as e:
            logger.error("Failed to load dataset %s: %s", self.dataset_id, e)
            raise

    def _transform_name(self) -> str:
        """Return the transform function's ``__name__``, or 'anonymous' if it has none."""
        return getattr(self.transform_fn, '__name__', 'anonymous')

    def get_evaluation_rows(
        self,
        split: Optional[str] = None,
        limit: Optional[int] = None,
        offset: int = 0,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        **completion_params_kwargs,
    ) -> Iterator[EvaluationRow]:
        """Convert dataset entries to EvaluationRow format.

        Rows whose conversion raises AttributeError, ValueError or KeyError are
        logged at warning level and skipped.

        Args:
            split: Dataset split to use (if dataset has multiple splits)
            limit: Maximum number of rows to return; ``None`` means no limit
                and ``0`` yields no rows
            offset: Number of rows to skip
            model_name: Model name for completion parameters
            temperature: Temperature for completion parameters
            max_tokens: Max tokens for completion parameters
            **completion_params_kwargs: Additional completion parameters

        Yields:
            EvaluationRow: Converted evaluation rows
        """
        # Select dataset split
        dataset = self.dataset
        if isinstance(self.dataset, DatasetDict):
            if split is None:
                # Use first available split
                split = list(self.dataset.keys())[0]
                logger.info("No split specified, using: %s", split)
            dataset = self.dataset[split]
        elif split is not None:
            logger.warning("Split '%s' specified but dataset is not split", split)

        # Apply offset and limit. Compare against None so that limit=0 means
        # "no rows" rather than being mistaken for "no limit".
        total_rows = len(dataset)
        if limit is not None:
            end_idx = min(offset + limit, total_rows)
        else:
            end_idx = total_rows

        if offset >= total_rows:
            logger.warning("Offset %d is greater than dataset size %d", offset, total_rows)
            return

        # Create completion parameters
        completion_params = CompletionParams(
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            **completion_params_kwargs,
        )

        # Convert each row; only the conversion is guarded, the yield is not.
        for i in range(offset, end_idx):
            try:
                eval_row = self._convert_row_to_evaluation_row(
                    dataset[i], i, completion_params, split
                )
            except (AttributeError, ValueError, KeyError) as e:
                logger.warning("Failed to convert row %d: %s", i, e)
                continue
            yield eval_row

    def _convert_row_to_evaluation_row(
        self,
        raw_row: Dict[str, Any],
        row_index: int,
        completion_params: CompletionParams,
        split: Optional[str] = None,
    ) -> EvaluationRow:
        """Convert a single dataset row to EvaluationRow format.

        Args:
            raw_row: Raw dataset row dictionary
            row_index: Index of the row in the dataset
            completion_params: Completion parameters to use
            split: Dataset split name

        Returns:
            EvaluationRow object

        Raises:
            ValueError: If the transform result has no 'messages' field, or a
                message is not a dictionary or lacks a 'role' field.
        """
        # Apply user transformation
        transformed = self.transform_fn(raw_row)

        # Validate required fields
        if 'messages' not in transformed:
            raise ValueError("Transform function must return 'messages' field")

        # Convert message dictionaries to Message objects
        messages = []
        for msg_dict in transformed['messages']:
            if not isinstance(msg_dict, dict):
                raise ValueError("Each message must be a dictionary")
            if 'role' not in msg_dict:
                raise ValueError("Each message must have a 'role' field")

            messages.append(Message(
                role=msg_dict['role'],
                content=msg_dict.get('content'),
                name=msg_dict.get('name'),
                tool_call_id=msg_dict.get('tool_call_id'),
                tool_calls=msg_dict.get('tool_calls'),
                function_call=msg_dict.get('function_call'),
            ))

        # Extract other fields
        ground_truth = transformed.get('ground_truth')
        tools = transformed.get('tools')
        # 'metadata' is optional and may be present but None; treat that as empty
        # so the update() below does not raise TypeError.
        user_metadata = transformed.get('metadata') or {}

        # Create dataset info
        dataset_info = {
            'dataset_id': self.dataset_id,
            'config_name': self.config_name,
            'revision': self.revision,
            'split': split,
            'row_index': row_index,
            'transform_function': self._transform_name(),
        }

        # Add user metadata (user keys override the defaults above)
        dataset_info.update(user_metadata)

        # Add original row data (with prefix to avoid conflicts)
        dataset_info.update({f'original_{key}': value for key, value in raw_row.items()})

        # Create input metadata
        input_metadata = InputMetadata(
            row_id=f"{self.dataset_id}_{row_index}",
            completion_params=completion_params,
            dataset_info=dataset_info,
            session_data={
                'dataset_source': 'huggingface',
                'timestamp': None,
            }
        )

        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=input_metadata,
            ground_truth=str(ground_truth) if ground_truth is not None else None,
        )

    def get_splits(self) -> List[str]:
        """Get available dataset splits.

        Returns:
            List of available split names; ``["train"]`` when the loaded
            dataset is not a DatasetDict.
        """
        if isinstance(self.dataset, DatasetDict):
            return list(self.dataset.keys())
        return ["train"]  # Default split name for non-split datasets

    def get_dataset_info(self) -> Dict[str, Any]:
        """Get information about the loaded dataset.

        Returns:
            Dictionary with dataset information. Contains 'split_sizes' for a
            DatasetDict and 'total_size' otherwise.
        """
        info = {
            'dataset_id': self.dataset_id,
            'config_name': self.config_name,
            'revision': self.revision,
            'splits': self.get_splits(),
            'transform_function': self._transform_name(),
        }

        # Add split sizes
        if isinstance(self.dataset, DatasetDict):
            info['split_sizes'] = {split: len(data) for split, data in self.dataset.items()}
        else:
            info['total_size'] = len(self.dataset)

        return info
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def create_huggingface_adapter(
    dataset_id: str,
    transform_fn: TransformFunction,
    config_name: Optional[str] = None,
    revision: Optional[str] = None,
    **load_dataset_kwargs,
) -> HuggingFaceAdapter:
    """Build a HuggingFaceAdapter from the given arguments.

    Thin factory: every argument is forwarded unchanged to the
    HuggingFaceAdapter constructor.

    Args:
        dataset_id: HuggingFace dataset identifier
        transform_fn: Function converting a dataset row to evaluation format
        config_name: Optional configuration name
        revision: Optional dataset revision/commit hash
        **load_dataset_kwargs: Extra keyword arguments for load_dataset

    Returns:
        The newly constructed HuggingFaceAdapter
    """
    adapter = HuggingFaceAdapter(
        dataset_id=dataset_id,
        transform_fn=transform_fn,
        config_name=config_name,
        revision=revision,
        **load_dataset_kwargs,
    )
    return adapter
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# Convenience functions for common datasets
|
|
360
|
+
def create_gsm8k_adapter(
    system_prompt: Optional[str] = None,
    revision: Optional[str] = None,
) -> HuggingFaceAdapter:
    """Build an adapter preconfigured for the GSM8K dataset ("main" config).

    Args:
        system_prompt: System message to prepend; a falsy value selects the
            built-in step-by-step math prompt
        revision: Optional dataset revision/commit

    Returns:
        HuggingFaceAdapter configured for GSM8K
    """
    if system_prompt:
        system_content = system_prompt
    else:
        system_content = (
            "You are a helpful assistant that solves math problems step by step. "
            "Show your work and provide the final answer."
        )

    def gsm8k_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Map a GSM8K row ('question', 'answer') to evaluation format."""
        question = row['question']
        answer = row['answer']

        conversation = [
            {'role': 'system', 'content': system_content},
            {'role': 'user', 'content': question},
        ]
        extra = {
            'dataset': 'gsm8k',
            'question_length': len(question),
            'answer_length': len(answer),
        }
        return {
            'messages': conversation,
            'ground_truth': answer,
            'metadata': extra,
        }

    return create_huggingface_adapter(
        dataset_id="gsm8k",
        transform_fn=gsm8k_transform,
        config_name="main",
        revision=revision,
    )
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def create_math_adapter(
    system_prompt: Optional[str] = None,
    revision: Optional[str] = None,
) -> HuggingFaceAdapter:
    """Build an adapter preconfigured for the MATH competition dataset.

    Args:
        system_prompt: System message to prepend; a falsy value selects the
            built-in expert-mathematician prompt
        revision: Optional dataset revision/commit

    Returns:
        HuggingFaceAdapter configured for MATH dataset
    """
    if system_prompt:
        system_content = system_prompt
    else:
        system_content = (
            "You are an expert mathematician. Solve this advanced math problem "
            "step by step, showing detailed work."
        )

    def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Map a MATH row ('problem', 'solution', optional 'type'/'level') to evaluation format."""
        problem = row['problem']
        solution = row['solution']

        conversation = [
            {'role': 'system', 'content': system_content},
            {'role': 'user', 'content': problem},
        ]
        extra = {
            'dataset': 'hendrycks_math',
            'type': row.get('type', 'unknown'),
            'level': row.get('level', 'unknown'),
            'problem_length': len(problem),
            'solution_length': len(solution),
        }
        return {
            'messages': conversation,
            'ground_truth': solution,
            'metadata': extra,
        }

    return create_huggingface_adapter(
        dataset_id="hendrycks/competition_math",
        transform_fn=math_transform,
        revision=revision,
    )
|