eval-protocol 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. {eval_protocol-0.2.0/eval_protocol.egg-info → eval_protocol-0.2.1}/PKG-INFO +10 -1
  2. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/_version.py +3 -3
  3. eval_protocol-0.2.1/eval_protocol/adapters/__init__.py +47 -0
  4. eval_protocol-0.2.1/eval_protocol/adapters/huggingface.py +444 -0
  5. eval_protocol-0.2.1/eval_protocol/adapters/langfuse.py +407 -0
  6. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/__init__.py +2 -0
  7. eval_protocol-0.2.1/eval_protocol/pytest/default_dataset_adapter.py +10 -0
  8. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_single_turn_rollout_process.py +12 -4
  9. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/evaluation_test.py +7 -7
  10. {eval_protocol-0.2.0 → eval_protocol-0.2.1/eval_protocol.egg-info}/PKG-INFO +10 -1
  11. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/SOURCES.txt +4 -0
  12. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/requires.txt +12 -0
  13. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/pyproject.toml +12 -0
  14. eval_protocol-0.2.1/tests/test_adapters_e2e.py +447 -0
  15. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli_agent.py +4 -0
  16. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_examples_end_to_end.py +4 -4
  17. eval_protocol-0.2.0/eval_protocol/adapters/__init__.py +0 -1
  18. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/LICENSE +0 -0
  19. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/README.md +0 -0
  20. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/__init__.py +0 -0
  21. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/normalize_sandbox_fusion.py +0 -0
  22. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/__init__.py +0 -0
  23. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/generate_api_key.py +0 -0
  24. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/development/utils/subprocess_manager.py +0 -0
  25. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/__init__.py +0 -0
  26. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/__main__.py +0 -0
  27. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/adapters/braintrust.py +0 -0
  28. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/adapters/trl.py +0 -0
  29. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/__init__.py +0 -0
  30. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/models.py +0 -0
  31. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/orchestrator.py +0 -0
  32. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resource_abc.py +0 -0
  33. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resource_pool.py +0 -0
  34. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/__init__.py +0 -0
  35. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  36. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  37. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  38. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  39. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  40. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/docker_resource.py +0 -0
  41. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  42. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  43. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  44. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  45. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/resources/sql_resource.py +0 -0
  46. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/task_manager.py +0 -0
  47. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/agent/tool_registry.py +0 -0
  48. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/auth.py +0 -0
  49. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli.py +0 -0
  50. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/__init__.py +0 -0
  51. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  52. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/common.py +0 -0
  53. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/deploy.py +0 -0
  54. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  55. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/preview.py +0 -0
  56. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  57. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/common_utils.py +0 -0
  58. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/config.py +0 -0
  59. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/datasets/__init__.py +0 -0
  60. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/datasets/loader.py +0 -0
  61. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/evaluation.py +0 -0
  62. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/execution/__init__.py +0 -0
  63. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/execution/pipeline.py +0 -0
  64. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/gcp_tools.py +0 -0
  65. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/cache.py +0 -0
  66. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/clients/base.py +0 -0
  67. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generation/clients.py +0 -0
  68. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/generic_server.py +0 -0
  69. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/__init__.py +0 -0
  70. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/braintrust.py +0 -0
  71. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/deepeval.py +0 -0
  72. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/openeval.py +0 -0
  73. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/integrations/trl.py +0 -0
  74. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/__init__.py +0 -0
  75. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/adapter.py +0 -0
  76. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/client/__init__.py +0 -0
  77. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/client/connection.py +0 -0
  78. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/clients.py +0 -0
  79. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/__init__.py +0 -0
  80. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/base_policy.py +0 -0
  81. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/manager.py +0 -0
  82. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/execution/policy.py +0 -0
  83. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/grid_renderer.py +0 -0
  84. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  85. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/mcpgym.py +0 -0
  86. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/process_manager.py +0 -0
  87. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/session/__init__.py +0 -0
  88. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/session/manager.py +0 -0
  89. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/simple_process_manager.py +0 -0
  90. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp/simulation_server.py +0 -0
  91. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/__init__.py +0 -0
  92. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/config.py +0 -0
  93. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  94. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/main.py +0 -0
  95. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  96. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  97. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  98. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  99. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  100. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_agent/session.py +0 -0
  101. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/mcp_env.py +0 -0
  102. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/models.py +0 -0
  103. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/packaging.py +0 -0
  104. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/platform_api.py +0 -0
  105. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/playback_policy.py +0 -0
  106. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  107. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  108. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  109. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/types.py +0 -0
  110. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/pytest/utils.py +0 -0
  111. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/resources.py +0 -0
  112. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/reward_function.py +0 -0
  113. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/__init__.py +0 -0
  114. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/accuracy.py +0 -0
  115. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/accuracy_length.py +0 -0
  116. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  117. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  118. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/apps_testing_util.py +0 -0
  119. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/bfcl_reward.py +0 -0
  120. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/code_execution.py +0 -0
  121. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/code_execution_utils.py +0 -0
  122. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/cpp_code.py +0 -0
  123. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  124. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/format.py +0 -0
  125. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/function_calling.py +0 -0
  126. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/json_schema.py +0 -0
  127. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/language_consistency.py +0 -0
  128. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/lean_prover.py +0 -0
  129. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/length.py +0 -0
  130. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  131. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/math.py +0 -0
  132. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  133. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/reasoning_steps.py +0 -0
  134. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/repetition.py +0 -0
  135. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rewards/tag_count.py +0 -0
  136. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/rl_processing.py +0 -0
  137. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/server.py +0 -0
  138. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/typed_interface.py +0 -0
  139. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/types/__init__.py +0 -0
  140. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/types/types.py +0 -0
  141. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/__init__.py +0 -0
  142. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/batch_evaluation.py +0 -0
  143. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/batch_transformation.py +0 -0
  144. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/dataset_helpers.py +0 -0
  145. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/module_loader.py +0 -0
  146. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/packaging_utils.py +0 -0
  147. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol/utils/static_policy.py +0 -0
  148. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/dependency_links.txt +0 -0
  149. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/entry_points.txt +0 -0
  150. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/eval_protocol.egg-info/top_level.txt +0 -0
  151. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/setup.cfg +0 -0
  152. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/setup.py +0 -0
  153. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_accuracy.py +0 -0
  154. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_accuracy_length.py +0 -0
  155. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_agent_orchestrator.py +0 -0
  156. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_agent_resources.py +0 -0
  157. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_auth.py +0 -0
  158. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_batch_evaluation.py +0 -0
  159. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_braintrust_adapter.py +0 -0
  160. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_braintrust_example.py +0 -0
  161. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli.py +0 -0
  162. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cli_args.py +0 -0
  163. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_code_execution.py +0 -0
  164. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_config.py +0 -0
  165. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_control_plane_separation.py +0 -0
  166. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_cpp_code.py +0 -0
  167. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_data_driven_task_manager.py +0 -0
  168. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deepcoder_reward.py +0 -0
  169. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deepeval_integration.py +0 -0
  170. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_deploy_integration.py +0 -0
  171. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_e2b_integration.py +0 -0
  172. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_e2b_js_integration.py +0 -0
  173. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_edge_cases.py +0 -0
  174. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_eval_protocol_import.py +0 -0
  175. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation.py +0 -0
  176. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation_integration.py +0 -0
  177. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_evaluation_preview_integration.py +0 -0
  178. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_fireworks_api.py +0 -0
  179. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_format.py +0 -0
  180. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_fractional_code.py +0 -0
  181. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_frozen_lake_http_server.py +0 -0
  182. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  183. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_function_calling.py +0 -0
  184. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_gcp_tools.py +0 -0
  185. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_generic_server.py +0 -0
  186. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_integration.py +0 -0
  187. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_json_schema.py +0 -0
  188. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_kwargs_validation.py +0 -0
  189. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_language_consistency.py +0 -0
  190. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_lean_prover.py +0 -0
  191. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_lean_prover_runner.py +0 -0
  192. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_length.py +0 -0
  193. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_list_comparison_math_reward.py +0 -0
  194. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_math.py +0 -0
  195. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_minimal.py +0 -0
  196. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_models.py +0 -0
  197. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_models_rl.py +0 -0
  198. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_multiple_choice_math_reward.py +0 -0
  199. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_n_variant_batch_integration.py +0 -0
  200. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_n_variant_integration.py +0 -0
  201. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_openai_compatibility.py +0 -0
  202. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_openeval_integration.py +0 -0
  203. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_packaging.py +0 -0
  204. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_parallel_rollouts.py +0 -0
  205. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_platform_api.py +0 -0
  206. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_readiness.py +0 -0
  207. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reasoning_steps.py +0 -0
  208. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_repetition.py +0 -0
  209. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_repetition_debug.py +0 -0
  210. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reward_function.py +0 -0
  211. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_reward_protocol_import.py +0 -0
  212. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_rl_processing.py +0 -0
  213. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_rollout_control_plane_integration.py +0 -0
  214. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_server.py +0 -0
  215. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_tag_count.py +0 -0
  216. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_typed_interface.py +0 -0
  217. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_typed_interface_rl.py +0 -0
  218. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/tests/test_url_handling.py +0 -0
  219. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/__init__.py +0 -0
  220. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/__init__.py +0 -0
  221. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/base.py +0 -0
  222. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/agent/llm_agent.py +0 -0
  223. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/__init__.py +0 -0
  224. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/api_config.py +0 -0
  225. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/data_model.py +0 -0
  226. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/api_service/simulation_service.py +0 -0
  227. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/cli.py +0 -0
  228. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/config.py +0 -0
  229. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/__init__.py +0 -0
  230. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/message.py +0 -0
  231. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/simulation.py +0 -0
  232. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/data_model/tasks.py +0 -0
  233. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/__init__.py +0 -0
  234. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/__init__.py +0 -0
  235. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/data_model.py +0 -0
  236. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/environment.py +0 -0
  237. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/tools.py +0 -0
  238. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/airline/utils.py +0 -0
  239. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/__init__.py +0 -0
  240. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/data_model.py +0 -0
  241. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/environment.py +0 -0
  242. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/tools.py +0 -0
  243. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/mock/utils.py +0 -0
  244. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/__init__.py +0 -0
  245. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/data_model.py +0 -0
  246. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/environment.py +0 -0
  247. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/tools.py +0 -0
  248. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/retail/utils.py +0 -0
  249. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/__init__.py +0 -0
  250. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/data_model.py +0 -0
  251. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/environment.py +0 -0
  252. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  253. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  254. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  255. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  256. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  257. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  258. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  259. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  260. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/tools.py +0 -0
  261. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  262. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  263. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/domains/telecom/utils.py +0 -0
  264. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/__init__.py +0 -0
  265. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/db.py +0 -0
  266. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/environment.py +0 -0
  267. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/server.py +0 -0
  268. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/tool.py +0 -0
  269. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/toolkit.py +0 -0
  270. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  271. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/__init__.py +0 -0
  272. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator.py +0 -0
  273. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  274. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  275. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  276. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  277. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  278. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/__init__.py +0 -0
  279. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/agent_metrics.py +0 -0
  280. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  281. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/__init__.py +0 -0
  282. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  283. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  284. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/orchestrator/utils.py +0 -0
  285. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/registry.py +0 -0
  286. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/run.py +0 -0
  287. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/__init__.py +0 -0
  288. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/check_data.py +0 -0
  289. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  290. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/start_servers.py +0 -0
  291. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/scripts/view_simulations.py +0 -0
  292. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/__init__.py +0 -0
  293. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/base.py +0 -0
  294. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/user/user_simulator.py +0 -0
  295. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/__init__.py +0 -0
  296. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/display.py +0 -0
  297. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/io_utils.py +0 -0
  298. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/llm_utils.py +0 -0
  299. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/pydantic_utils.py +0 -0
  300. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/vendor/tau2/utils/utils.py +0 -0
  301. {eval_protocol-0.2.0 → eval_protocol-0.2.1}/versioneer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: Apache-2.0
@@ -81,6 +81,15 @@ Provides-Extra: box2d
81
81
  Requires-Dist: swig; extra == "box2d"
82
82
  Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
83
83
  Requires-Dist: Pillow; extra == "box2d"
84
+ Provides-Extra: langfuse
85
+ Requires-Dist: langfuse>=2.0.0; extra == "langfuse"
86
+ Provides-Extra: huggingface
87
+ Requires-Dist: datasets>=2.0.0; extra == "huggingface"
88
+ Requires-Dist: transformers>=4.0.0; extra == "huggingface"
89
+ Provides-Extra: adapters
90
+ Requires-Dist: langfuse>=2.0.0; extra == "adapters"
91
+ Requires-Dist: datasets>=2.0.0; extra == "adapters"
92
+ Requires-Dist: transformers>=4.0.0; extra == "adapters"
84
93
  Dynamic: license-file
85
94
 
86
95
  # Eval Protocol (EP)
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-04T10:11:52-0700",
11
+ "date": "2025-08-04T14:28:02-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "438074ad1d0308bce0c3afaec29b7e044ccc1c3a",
15
- "version": "0.2.0"
14
+ "full-revisionid": "07fda02490d1a09c7ab92595d6397622cb64230d",
15
+ "version": "0.2.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,47 @@
1
+ """Data source adapters for Eval Protocol.
2
+
3
+ This package provides adapters for integrating with various data sources
4
+ and converting them to EvaluationRow format for use in evaluation pipelines.
5
+
6
+ Available adapters:
7
+ - LangfuseAdapter: Pull data from Langfuse deployments
8
+ - HuggingFaceAdapter: Load datasets from HuggingFace Hub
9
+ - Braintrust integration (legacy)
10
+ - TRL integration (legacy)
11
+ """
12
+
13
+ # Conditional imports based on available dependencies
14
+ try:
15
+ from .langfuse import LangfuseAdapter, create_langfuse_adapter
16
+ __all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
17
+ except ImportError:
18
+ __all__ = []
19
+
20
+ try:
21
+ from .huggingface import (
22
+ HuggingFaceAdapter,
23
+ create_huggingface_adapter,
24
+ create_gsm8k_adapter,
25
+ create_math_adapter,
26
+ )
27
+ __all__.extend([
28
+ "HuggingFaceAdapter",
29
+ "create_huggingface_adapter",
30
+ "create_gsm8k_adapter",
31
+ "create_math_adapter",
32
+ ])
33
+ except ImportError:
34
+ pass
35
+
36
+ # Legacy adapters (exported only when their modules import successfully)
37
+ try:
38
+ from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
39
+ __all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
40
+ except ImportError:
41
+ pass
42
+
43
+ try:
44
+ from .trl import create_trl_adapter
45
+ __all__.extend(["create_trl_adapter"])
46
+ except ImportError:
47
+ pass
@@ -0,0 +1,444 @@
1
+ """HuggingFace Datasets adapter for Eval Protocol.
2
+
3
+ This adapter allows loading datasets from HuggingFace Hub with arbitrary
4
+ transformation functions to convert them to EvaluationRow format.
5
+ """
6
+
7
+ from typing import Any, Callable, Dict, Iterator, List, Optional
8
+ import logging
9
+
10
+ from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ try:
15
+ from datasets import load_dataset, Dataset, DatasetDict
16
+ DATASETS_AVAILABLE = True
17
+ except ImportError:
18
+ DATASETS_AVAILABLE = False
19
+ logger.warning(
20
+ "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
21
+ )
22
+
23
+ # Type alias for transformation function
24
+ TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
25
+
26
+
27
class HuggingFaceAdapter:
    """Generic adapter to load HuggingFace datasets with custom transformations.

    This adapter loads datasets from HuggingFace Hub and applies a user-provided
    transformation function to convert each row to the format expected by
    EvaluationRow.

    The transformation function should take a dataset row dictionary and return:
    {
        'messages': List[Dict] - list of message dictionaries with 'role' and 'content'
        'ground_truth': Optional[str] - expected answer/output
        'metadata': Optional[Dict] - any additional metadata to preserve
        'tools': Optional[List[Dict]] - tool definitions for tool calling scenarios
    }

    Examples:
        Simple Q&A dataset:
        >>> def transform(row):
        ...     return {
        ...         'messages': [{'role': 'user', 'content': row['question']}],
        ...         'ground_truth': row['answer'],
        ...         'metadata': {'category': row.get('category')}
        ...     }
        >>> adapter = HuggingFaceAdapter("my-dataset", transform_fn=transform)
        >>> rows = list(adapter.get_evaluation_rows(split="test", limit=10))

        Math problems with system prompt:
        >>> def gsm8k_transform(row):
        ...     return {
        ...         'messages': [
        ...             {'role': 'system', 'content': 'Solve step by step.'},
        ...             {'role': 'user', 'content': row['question']}
        ...         ],
        ...         'ground_truth': row['answer'],
        ...         'metadata': {'dataset': 'gsm8k'}
        ...     }
        >>> adapter = HuggingFaceAdapter("gsm8k", config_name="main", transform_fn=gsm8k_transform)
    """

    # (file suffix, load_dataset builder name) pairs consulted by from_local.
    # Suffixes are matched case-insensitively, in order.
    _LOCAL_LOADERS = (
        ('.jsonl', 'json'),
        ('.json', 'json'),
        ('.csv', 'csv'),
        ('.parquet', 'parquet'),
    )

    def __init__(
        self,
        dataset_id: str,
        transform_fn: TransformFunction,
        config_name: Optional[str] = None,
        revision: Optional[str] = None,
        **load_dataset_kwargs,
    ):
        """Initialize the HuggingFace adapter and load the dataset eagerly.

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., "gsm8k", "squad", "org/dataset")
            transform_fn: Function to transform dataset rows to evaluation format
            config_name: Optional dataset configuration name
            revision: Optional dataset revision/commit hash
            **load_dataset_kwargs: Additional arguments to pass to load_dataset

        Raises:
            ImportError: If the ``datasets`` package is not installed.
        """
        if not DATASETS_AVAILABLE:
            raise ImportError(
                "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
            )

        self.dataset_id = dataset_id
        self.transform_fn = transform_fn
        self.config_name = config_name
        self.revision = revision
        self.load_dataset_kwargs = load_dataset_kwargs

        # Load the dataset
        self.dataset = self._load_dataset()

    @classmethod
    def from_local(
        cls,
        path: str,
        transform_fn: TransformFunction,
        **load_dataset_kwargs,
    ) -> "HuggingFaceAdapter":
        """Create adapter from local dataset file.

        Args:
            path: Path to local dataset file (JSON, JSONL, CSV, Parquet)
            transform_fn: Function to transform dataset rows
            **load_dataset_kwargs: Additional arguments to pass to load_dataset

        Returns:
            HuggingFaceAdapter instance
        """
        # Pick the builder from the file extension (case-insensitive so that
        # e.g. "DATA.CSV" is not sent to the JSON builder). Unrecognised
        # extensions fall back to the JSON builder.
        lowered_path = path.lower()
        dataset_type = next(
            (
                loader
                for suffix, loader in cls._LOCAL_LOADERS
                if lowered_path.endswith(suffix)
            ),
            "json",
        )

        load_kwargs = {'data_files': path, **load_dataset_kwargs}

        return cls(
            dataset_id=dataset_type,
            transform_fn=transform_fn,
            **load_kwargs,
        )

    def _transform_name(self) -> str:
        """Return the transform callable's ``__name__``, or 'anonymous' if it has none."""
        return getattr(self.transform_fn, '__name__', 'anonymous')

    def _load_dataset(self) -> "Dataset | DatasetDict":
        """Load the dataset from HuggingFace Hub or local source.

        ``config_name`` and ``revision`` are forwarded only when set; any
        extra ``load_dataset_kwargs`` are applied last and therefore win.

        Raises:
            OSError, ValueError, RuntimeError: Logged and re-raised unchanged.
        """
        kwargs = {}
        if self.config_name:
            kwargs['name'] = self.config_name
        if self.revision:
            kwargs['revision'] = self.revision
        kwargs.update(self.load_dataset_kwargs)

        try:
            return load_dataset(self.dataset_id, **kwargs)
        except (OSError, ValueError, RuntimeError) as e:
            logger.error("Failed to load dataset %s: %s", self.dataset_id, e)
            raise

    def get_evaluation_rows(
        self,
        split: Optional[str] = None,
        limit: Optional[int] = None,
        offset: int = 0,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        **completion_params_kwargs,
    ) -> Iterator[EvaluationRow]:
        """Convert dataset entries to EvaluationRow format.

        Rows whose conversion raises AttributeError, ValueError or KeyError
        are logged and skipped rather than aborting the iteration.

        Args:
            split: Dataset split to use (if dataset has multiple splits)
            limit: Maximum number of rows to return; ``None`` means no limit
                and ``0`` yields nothing
            offset: Number of rows to skip (must be non-negative)
            model_name: Model name for completion parameters
            temperature: Temperature for completion parameters
            max_tokens: Max tokens for completion parameters
            **completion_params_kwargs: Additional completion parameters

        Yields:
            EvaluationRow: Converted evaluation rows

        Raises:
            ValueError: If ``offset`` is negative.
        """
        # A negative offset would index from the end of the dataset and yield
        # duplicated rows with negative row indices, so reject it outright.
        if offset < 0:
            raise ValueError(f"offset must be non-negative, got {offset}")

        # Select dataset split
        dataset = self.dataset
        if isinstance(self.dataset, DatasetDict):
            if split is None:
                # Use first available split
                split = list(self.dataset.keys())[0]
                logger.info("No split specified, using: %s", split)
            dataset = self.dataset[split]
        elif split is not None:
            logger.warning("Split '%s' specified but dataset is not split", split)

        # Apply offset and limit
        total_rows = len(dataset)
        if offset >= total_rows:
            logger.warning("Offset %d is greater than dataset size %d", offset, total_rows)
            return

        # Compare against None explicitly: ``limit=0`` is a request for zero
        # rows, not for "no limit".
        end_idx = total_rows if limit is None else min(offset + limit, total_rows)

        # Create completion parameters
        completion_params = CompletionParams(
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            **completion_params_kwargs,
        )

        # Convert each row
        for i in range(offset, end_idx):
            try:
                raw_row = dataset[i]
                eval_row = self._convert_row_to_evaluation_row(
                    raw_row, i, completion_params, split
                )
                yield eval_row
            except (AttributeError, ValueError, KeyError) as e:
                logger.warning("Failed to convert row %d: %s", i, e)
                continue

    def _convert_row_to_evaluation_row(
        self,
        raw_row: Dict[str, Any],
        row_index: int,
        completion_params: CompletionParams,
        split: Optional[str] = None,
    ) -> EvaluationRow:
        """Convert a single dataset row to EvaluationRow format.

        Args:
            raw_row: Raw dataset row dictionary
            row_index: Index of the row in the dataset
            completion_params: Completion parameters to use
            split: Dataset split name

        Returns:
            EvaluationRow object

        Raises:
            ValueError: If the transform returns ``None``, omits 'messages',
                or produces a message that is not a dict with a 'role'.
        """
        # Apply user transformation
        transformed = self.transform_fn(raw_row)

        # Validate required fields. A transform that forgot to return would
        # otherwise surface as an uncaught TypeError and abort the iteration.
        if transformed is None:
            raise ValueError("Transform function returned None")
        if 'messages' not in transformed:
            raise ValueError("Transform function must return 'messages' field")

        # Convert message dictionaries to Message objects
        messages = []
        for msg_dict in transformed['messages']:
            if not isinstance(msg_dict, dict):
                raise ValueError("Each message must be a dictionary")
            if 'role' not in msg_dict:
                raise ValueError("Each message must have a 'role' field")

            messages.append(Message(
                role=msg_dict['role'],
                content=msg_dict.get('content'),
                name=msg_dict.get('name'),
                tool_call_id=msg_dict.get('tool_call_id'),
                tool_calls=msg_dict.get('tool_calls'),
                function_call=msg_dict.get('function_call'),
            ))

        # Extract other fields
        ground_truth = transformed.get('ground_truth')
        tools = transformed.get('tools')
        # 'metadata' is documented as optional, so an explicit None must be
        # treated the same as a missing key.
        user_metadata = transformed.get('metadata') or {}

        # Create dataset info
        dataset_info = {
            'dataset_id': self.dataset_id,
            'config_name': self.config_name,
            'revision': self.revision,
            'split': split,
            'row_index': row_index,
            'transform_function': self._transform_name(),
        }

        # Add user metadata (applied after the provenance keys, so it may
        # override them)
        dataset_info.update(user_metadata)

        # Add original row data (with prefix to avoid conflicts)
        for key, value in raw_row.items():
            dataset_info[f'original_{key}'] = value

        # Create input metadata
        input_metadata = InputMetadata(
            row_id=f"{self.dataset_id}_{row_index}",
            completion_params=completion_params,
            dataset_info=dataset_info,
            session_data={
                'dataset_source': 'huggingface',
                'timestamp': None,
            },
        )

        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=input_metadata,
            ground_truth=str(ground_truth) if ground_truth is not None else None,
        )

    def get_splits(self) -> List[str]:
        """Get available dataset splits.

        Returns:
            List of available split names
        """
        if isinstance(self.dataset, DatasetDict):
            return list(self.dataset.keys())
        return ["train"]  # Default split name for non-split datasets

    def get_dataset_info(self) -> Dict[str, Any]:
        """Get information about the loaded dataset.

        Returns:
            Dictionary with dataset information; includes 'split_sizes' for a
            multi-split dataset and 'total_size' otherwise
        """
        info = {
            'dataset_id': self.dataset_id,
            'config_name': self.config_name,
            'revision': self.revision,
            'splits': self.get_splits(),
            'transform_function': self._transform_name(),
        }

        # Add split sizes
        if isinstance(self.dataset, DatasetDict):
            info['split_sizes'] = {split: len(data) for split, data in self.dataset.items()}
        else:
            info['total_size'] = len(self.dataset)

        return info
329
+
330
+
331
def create_huggingface_adapter(
    dataset_id: str,
    transform_fn: TransformFunction,
    config_name: Optional[str] = None,
    revision: Optional[str] = None,
    **load_dataset_kwargs,
) -> HuggingFaceAdapter:
    """Build a HuggingFaceAdapter from the given dataset settings.

    This is a plain factory: every argument is handed to the
    HuggingFaceAdapter constructor unchanged.

    Args:
        dataset_id: HuggingFace dataset identifier
        transform_fn: Callable converting a dataset row to evaluation format
        config_name: Dataset configuration name, if any
        revision: Dataset revision/commit hash, if any
        **load_dataset_kwargs: Extra keyword arguments for load_dataset

    Returns:
        The constructed HuggingFaceAdapter
    """
    adapter_options = dict(
        load_dataset_kwargs,
        config_name=config_name,
        revision=revision,
    )
    return HuggingFaceAdapter(dataset_id, transform_fn, **adapter_options)
357
+
358
+
359
+ # Convenience functions for common datasets
360
def create_gsm8k_adapter(
    system_prompt: Optional[str] = None,
    revision: Optional[str] = None,
) -> HuggingFaceAdapter:
    """Build an adapter preconfigured for the GSM8K dataset ("main" config).

    Args:
        system_prompt: System message to prepend to every problem; a built-in
            step-by-step prompt is used when this is None or empty
        revision: Dataset revision/commit to load, if any

    Returns:
        HuggingFaceAdapter whose transform maps 'question'/'answer' rows to
        a system + user message pair with the answer as ground truth
    """
    if system_prompt:
        system_content = system_prompt
    else:
        system_content = (
            "You are a helpful assistant that solves math problems step by step. "
            "Show your work and provide the final answer."
        )

    # The inner function's name is recorded in each row's dataset_info, so it
    # is part of the observable output.
    def gsm8k_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Map one GSM8K row onto the evaluation-row dictionary layout."""
        question = row['question']
        answer = row['answer']
        conversation = [
            {'role': 'system', 'content': system_content},
            {'role': 'user', 'content': question},
        ]
        stats = {
            'dataset': 'gsm8k',
            'question_length': len(question),
            'answer_length': len(answer),
        }
        return {
            'messages': conversation,
            'ground_truth': answer,
            'metadata': stats,
        }

    return create_huggingface_adapter(
        "gsm8k",
        gsm8k_transform,
        config_name="main",
        revision=revision,
    )
401
+
402
+
403
def create_math_adapter(
    system_prompt: Optional[str] = None,
    revision: Optional[str] = None,
) -> HuggingFaceAdapter:
    """Build an adapter preconfigured for the MATH competition dataset.

    Args:
        system_prompt: System message to prepend to every problem; a built-in
            expert-mathematician prompt is used when this is None or empty
        revision: Dataset revision/commit to load, if any

    Returns:
        HuggingFaceAdapter whose transform maps 'problem'/'solution' rows to
        a system + user message pair with the solution as ground truth
    """
    if system_prompt:
        system_content = system_prompt
    else:
        system_content = (
            "You are an expert mathematician. Solve this advanced math problem "
            "step by step, showing detailed work."
        )

    # The inner function's name is recorded in each row's dataset_info, so it
    # is part of the observable output.
    def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Map one MATH row onto the evaluation-row dictionary layout."""
        problem = row['problem']
        solution = row['solution']
        conversation = [
            {'role': 'system', 'content': system_content},
            {'role': 'user', 'content': problem},
        ]
        stats = {
            'dataset': 'hendrycks_math',
            'type': row.get('type', 'unknown'),
            'level': row.get('level', 'unknown'),
            'problem_length': len(problem),
            'solution_length': len(solution),
        }
        return {
            'messages': conversation,
            'ground_truth': solution,
            'metadata': stats,
        }

    return create_huggingface_adapter(
        "hendrycks/competition_math",
        math_transform,
        revision=revision,
    )