eval-framework 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. {eval_framework-0.5.0 → eval_framework-0.5.1}/PKG-INFO +2 -2
  2. {eval_framework-0.5.0 → eval_framework-0.5.1}/pyproject.toml +4 -4
  3. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py +5 -9
  4. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/response_generator.py +8 -14
  5. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +30 -0
  6. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +34 -40
  7. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/squad.py +5 -0
  8. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +5 -0
  9. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/eval_config.py +2 -3
  10. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/registry.py +102 -18
  11. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_names.py +2 -0
  12. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py +6 -12
  13. {eval_framework-0.5.0 → eval_framework-0.5.1}/LICENSE +0 -0
  14. {eval_framework-0.5.0 → eval_framework-0.5.1}/README.md +0 -0
  15. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/__init__.py +0 -0
  16. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/base_config.py +0 -0
  17. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/__init__.py +0 -0
  18. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/determined.py +0 -0
  19. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/eval.py +0 -0
  20. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/local.py +0 -0
  21. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/exceptions.py +0 -0
  22. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/drop_process_results.py +0 -0
  23. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  24. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  25. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  26. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  27. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  28. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/__init__.py +0 -0
  29. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
  30. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/base.py +0 -0
  31. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/huggingface.py +0 -0
  32. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/mistral.py +0 -0
  33. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/models.py +0 -0
  34. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/openai.py +0 -0
  35. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/vllm.py +0 -0
  36. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/vllm_local_server.py +0 -0
  37. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/logger.py +0 -0
  38. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/main.py +0 -0
  39. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/__init__.py +0 -0
  40. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  41. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  42. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/base.py +0 -0
  43. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
  44. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  45. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  46. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
  47. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
  48. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  49. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  50. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  51. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  52. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  53. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  54. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  55. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/f1.py +0 -0
  56. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  57. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  58. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  59. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
  60. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  61. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
  62. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  63. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  64. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  65. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  66. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  67. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  68. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
  69. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  70. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  71. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  72. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  73. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  74. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ter.py +0 -0
  75. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  76. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  77. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  78. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/__init__.py +0 -0
  79. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/base.py +0 -0
  80. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  81. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  82. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  83. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  84. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  85. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  86. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  87. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  88. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  89. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  90. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  91. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  92. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  93. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  94. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  95. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  96. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  97. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  98. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  99. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  100. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  101. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  102. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  103. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  104. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  105. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/utils.py +0 -0
  106. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  107. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  108. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  109. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  110. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  111. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  112. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  113. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  114. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/py.typed +0 -0
  115. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/__init__.py +0 -0
  116. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/base.py +0 -0
  117. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  118. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/result_processor.py +0 -0
  119. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  120. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/run.py +0 -0
  121. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/run_direct.py +0 -0
  122. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/shared/types.py +0 -0
  123. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/suite.py +0 -0
  124. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  125. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/__init__.py +0 -0
  126. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/base.py +0 -0
  127. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  128. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  129. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  130. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  131. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  132. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  133. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  134. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  135. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  136. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  137. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  138. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  139. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  140. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  141. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  142. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  143. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  144. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  145. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  146. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  147. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  148. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  149. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  150. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  151. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  152. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  153. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  154. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  155. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  156. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  157. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  158. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  159. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  160. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  161. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  162. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  163. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  164. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  165. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  166. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  167. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  168. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  169. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  170. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  171. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  172. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  173. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  174. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  175. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  176. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  177. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/dataset_revisions.py +0 -0
  178. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/perturbation.py +0 -0
  179. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task-dataset-revisions.json +0 -0
  180. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_loader.py +0 -0
  181. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_style.py +0 -0
  182. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/utils.py +0 -0
  183. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/constants.py +0 -0
  184. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/file_ops.py +0 -0
  185. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/helpers.py +0 -0
  186. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/logging.py +0 -0
  187. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/packaging.py +0 -0
  188. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
  189. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/README.md +0 -0
  190. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/__init__.py +0 -0
  191. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/formatter.py +0 -0
  192. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/mistral_formatter.py +0 -0
  193. {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -235,7 +235,7 @@ Requires-Dist: wandb>=0.27.2,<1
235
235
  Requires-Dist: boto3>=1.43.19,<2
236
236
  Requires-Dist: numpy>=2.2.6
237
237
  Requires-Dist: antlr4-python3-runtime==4.11.0
238
- Requires-Dist: scipy>=1.17.1,<2
238
+ Requires-Dist: scipy>=1.18.0,<2
239
239
  Requires-Dist: accelerate ; extra == 'accelerate'
240
240
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
241
241
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.5.0"
3
+ version = "0.5.1"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -44,7 +44,7 @@ dependencies = [
44
44
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
45
45
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
46
46
  "antlr4-python3-runtime==4.11.0",
47
- "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
47
+ "scipy>=1.18.0,<2", # required for the aggregation of pass@k metrics
48
48
  ]
49
49
 
50
50
  [project.optional-dependencies]
@@ -104,12 +104,12 @@ dev = [
104
104
  "pip-licenses>=5.5.5",
105
105
  ]
106
106
  flash-attn = [
107
- "flash-attn>=2.8.3,<2.9",
107
+ "flash-attn>=2.8.3.post1,<2.9",
108
108
  "torch"
109
109
  ]
110
110
 
111
111
  [build-system]
112
- requires = ["uv_build>=0.11.22,<0.11.23"]
112
+ requires = ["uv_build>=0.11.23,<0.11.24"]
113
113
  build-backend = "uv_build"
114
114
 
115
115
  [tool.uv.build-backend]
@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
18
18
  from eval_framework.shared.types import Completion, Loglikelihood
19
19
  from eval_framework.tasks.base import ResponseType
20
20
  from eval_framework.tasks.eval_config import EvalConfig
21
- from eval_framework.tasks.registry import get_task
21
+ from eval_framework.tasks.registry import registry
22
22
  from eval_framework.utils.constants import RED, RESET
23
23
  from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
24
24
 
@@ -36,13 +36,9 @@ class EvaluationGenerator:
36
36
  self.result_processor = result_processor
37
37
  self.save_intermediate_results = config.save_intermediate_results
38
38
 
39
- task_class = get_task(config.task_name)
40
- if hasattr(task_class, "TASK_STYLER"):
41
- response_type = task_class.TASK_STYLER.response_type
42
- task_metrics = list(task_class.TASK_STYLER.metrics)
43
- else:
44
- response_type = task_class.RESPONSE_TYPE
45
- task_metrics = task_class.METRICS
39
+ eval_ = registry()[config.task_name]
40
+ response_type = eval_.response_type()
41
+ task_metrics = eval_.metrics()
46
42
 
47
43
  if response_type == ResponseType.COMPLETION:
48
44
  self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
@@ -51,7 +47,7 @@ class EvaluationGenerator:
51
47
  else:
52
48
  raise NotImplementedError
53
49
 
54
- self.task_name = task_class.NAME
50
+ self.task_name = eval_.task_class().NAME
55
51
 
56
52
  def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
57
53
  results: list[Result] = self.result_processor.load_metrics_results()
@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
5
5
  from datetime import UTC, datetime
6
6
  from functools import partial
7
7
 
8
- from eval_framework.tasks.registry import get_task
8
+ from eval_framework.tasks.registry import registry
9
9
 
10
10
  try:
11
11
  from determined._info import get_cluster_info
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
28
28
  )
29
29
  from eval_framework.tasks.base import Language, ResponseType, Sample
30
30
  from eval_framework.tasks.eval_config import EvalConfig
31
- from eval_framework.tasks.perturbation import create_perturbation_class
32
31
  from eval_framework.tasks.utils import raise_errors
33
32
  from eval_framework.utils.constants import RED, RESET
34
33
  from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -54,7 +53,6 @@ def map_language_to_value(
54
53
 
55
54
  class ResponseGenerator:
56
55
  def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
57
- self.few_shot = config.num_fewshot
58
56
  self.task_name = config.task_name
59
57
  self.llm = llm
60
58
  self.config = config
@@ -62,20 +60,16 @@ class ResponseGenerator:
62
60
  self.num_samples = config.num_samples
63
61
  self.save_intermediate_results = config.save_intermediate_results
64
62
 
65
- task_class = get_task(config.task_name)
66
-
67
63
  if config.perturbation_config is not None:
68
- perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
69
- self.task = perturbation_task_class.with_overwrite(
70
- self.few_shot,
71
- custom_subjects=self.config.task_subjects,
72
- custom_hf_revision=self.config.hf_revision,
64
+ self.task = registry()[config.task_name].create_perturbation(
65
+ config.perturbation_config,
66
+ config.num_fewshot,
67
+ config.task_subjects,
68
+ config.hf_revision,
73
69
  )
74
70
  else:
75
- self.task = task_class.with_overwrite(
76
- self.few_shot,
77
- custom_subjects=self.config.task_subjects,
78
- custom_hf_revision=self.config.hf_revision,
71
+ self.task = registry()[config.task_name].create(
72
+ config.num_fewshot, config.task_subjects, config.hf_revision
79
73
  )
80
74
 
81
75
  self.response_type = self.task.get_response_type()
@@ -4,6 +4,7 @@ from typing import Any
4
4
 
5
5
  from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
6
6
  from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
7
+ from eval_framework.tasks.task_style import BPBStyle
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
@@ -215,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
215
216
 
216
217
  def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
217
218
  return self._clean_short_answer(completion_text)
219
+
220
+
221
+ class GSM8KBPB(GSM8K_OLMES):
222
+ NAME = "GSM8KBPB"
223
+ TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
224
+
225
+ # BPBStyle already adds "Answer:" as that separate assistant message. But the methods we inherit
226
+ # still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
227
+ # So we override them here: remove "Answer:" from the question, and add it back in front of the
228
+ # fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
229
+ # no "Answer:" label at all.
230
+
231
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
232
+ return f"Question: {item['question']}\n"
233
+
234
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
235
+ return f"Answer:{self.normalize_answer_str(item)}"
236
+
237
+ def _get_raw_question(self, item: dict[str, Any]) -> str:
238
+ return item["question"]
239
+
240
+ def _get_choices(self, item: dict[str, Any]) -> list[str]:
241
+ return [self.normalize_answer_str(item)]
242
+
243
+ def _get_correct_index(self, item: dict[str, Any]) -> int:
244
+ return 0
245
+
246
+ def _get_ground_truth(self, item: dict[str, Any]) -> str:
247
+ return self._get_choices(item)[0]
@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
14
14
  extract_answers,
15
15
  normalized_gold_from_solution,
16
16
  )
17
- from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
18
17
  from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
18
+ from eval_framework.tasks.task_style import BPBStyle
19
19
 
20
20
  # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
21
21
  MATH_SUBJECTS = [
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
612
612
  super().__init__(num_fewshot)
613
613
 
614
614
 
615
- class MATHMinervaBPB(MATHReasoning):
616
- """
617
- MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
618
- gold answer string (bits-per-byte).
619
- Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
620
- """
621
-
622
- NAME = "MATHMinervaBPB"
623
- DATASET_PATH = "EleutherAI/hendrycks_math"
624
- SAMPLE_SPLIT = "test"
625
- FEWSHOT_SPLIT = "train"
626
- RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
627
- METRICS = [BitsPerByteLoglikelihood]
628
- SUBJECTS = MATH_SUBJECTS
629
- LANGUAGE = Language.ENG
630
-
631
- def _get_instruction_text(self, item: dict[str, Any]) -> str:
632
- return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
633
-
634
- def _get_cue_text(self, item: dict[str, Any]) -> str:
635
- return ""
636
-
637
- def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
638
- normalized = self._normalized_gold_from_solution(item["solution"])
639
- if normalized is None:
640
- return None
641
- return " " + normalized
642
-
643
- def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
644
- normalized = self._normalized_gold_from_solution(item["solution"])
645
- if normalized is None:
646
- return None
647
- return [" " + normalized]
648
-
649
- def _normalized_gold_from_solution(self, solution: str) -> str | None:
650
- return normalized_gold_from_solution(solution)
651
-
652
-
653
615
  class MATHLvl5(MATH):
654
616
  NAME = "Math Lvl 5"
655
617
 
@@ -742,7 +704,7 @@ Answer:"""
742
704
 
743
705
 
744
706
  _OLMES_FEWSHOTS = [
745
- ## https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
707
+ # https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
746
708
  {
747
709
  "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
748
710
  "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
790
752
 
791
753
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
792
754
  return _OLMES_FEWSHOTS[: self.num_fewshot]
755
+
756
+
757
+ class MATHMinervaBPB(MATHMinerva_OLMES):
758
+ NAME = "MATHMinervaBPB"
759
+ TASK_STYLER = BPBStyle(cue_text="Solution:")
760
+
761
+ # BPBStyle already adds "Solution:" as that separate assistant message. But the methods we inherit
762
+ # still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
763
+ # So we override them here: remove "Solution:" from the question, and add it back in front of the
764
+ # fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
765
+ # no "Solution:" label at all.
766
+
767
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
768
+ return "Problem:\n" + item["problem"] + "\n\n"
769
+
770
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
771
+ return f"Solution: {item['solution']}"
772
+
773
+ def _get_choices(self, item: dict[str, Any]) -> list[str]:
774
+ answer = normalized_gold_from_solution(item["solution"])
775
+ template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
776
+
777
+ return [item["solution"] + template]
778
+
779
+ def _get_correct_index(self, item: dict[str, Any]) -> int:
780
+ return 0
781
+
782
+ def _get_raw_question(self, item: dict[str, Any]) -> str:
783
+ return item["problem"]
784
+
785
+ def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
786
+ return self._get_choices(item)[0]
@@ -244,6 +244,11 @@ class SQuAD2_MA(SQUAD2):
244
244
 
245
245
  METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
246
246
 
247
+ def __init__(self, num_fewshot: int = 0) -> None:
248
+ super().__init__(num_fewshot)
249
+ self.stop_sequences = []
250
+ self.max_tokens = None
251
+
247
252
  def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
248
253
  return (
249
254
  "You are a helpful assistant and will answer the user's questions carefully, "
@@ -52,6 +52,11 @@ class TriviaQA_MA(TRIVIAQA):
52
52
  METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
53
53
  PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
54
54
 
55
+ def __init__(self, num_fewshot: int = 0) -> None:
56
+ super().__init__(num_fewshot)
57
+ self.stop_sequences = []
58
+ self.max_tokens = None
59
+
55
60
  def _get_context_text(self, item: dict[str, Any]) -> str:
56
61
  return "\n\n".join(item["entity_pages"]["wiki_context"])
57
62
 
@@ -10,7 +10,7 @@ from eval_framework.llm.base import BaseLLM
10
10
  from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
11
11
  from eval_framework.tasks.base import BaseTask
12
12
  from eval_framework.tasks.perturbation import PerturbationConfig
13
- from eval_framework.tasks.registry import get_task, validate_task_name
13
+ from eval_framework.tasks.registry import get_task, registry, validate_task_name
14
14
  from eval_framework.utils.constants import ROOT_DIR
15
15
 
16
16
  # Keys that don't impact actual evaluation results and should be excluded from config dumps for hashing purposes.
@@ -115,8 +115,7 @@ class EvalConfig(BaseConfig):
115
115
 
116
116
  @model_validator(mode="after")
117
117
  def validate_llm_judge_defined(self) -> "EvalConfig":
118
- task = get_task(self.task_name)
119
- task_metrics = task(num_fewshot=0).get_metrics()
118
+ task_metrics = registry()[self.task_name].metrics()
120
119
  for metric_class in task_metrics:
121
120
  if issubclass(metric_class, BaseLLMJudgeMetric):
122
121
  assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
@@ -3,15 +3,19 @@ import importlib
3
3
  import re
4
4
  from abc import ABC, abstractmethod
5
5
  from collections.abc import Generator, Iterator, Sequence
6
- from typing import Any
6
+ from typing import TYPE_CHECKING, Any
7
7
 
8
- from eval_framework.tasks.base import BaseTask
8
+ from eval_framework.tasks.base import BaseTask, ResponseType
9
+ from eval_framework.tasks.perturbation import PerturbationConfig, create_perturbation_class
9
10
  from eval_framework.utils.packaging import is_extra_installed, validate_package_extras
10
11
 
12
+ if TYPE_CHECKING:
13
+ from eval_framework.metrics.base import BaseMetric
14
+
11
15
  __all__ = [
12
16
  "register_task",
13
17
  "register_lazy_task",
14
- "BenchmarkFactory",
18
+ "EvalFactory",
15
19
  "Registry",
16
20
  "with_registry",
17
21
  "get_task",
@@ -22,13 +26,13 @@ __all__ = [
22
26
  ]
23
27
 
24
28
 
25
- class BenchmarkFactory(ABC):
26
- """Produces a registered benchmark's task.
29
+ class EvalFactory(ABC):
30
+ """Produces a registered benchmark's eval.
27
31
 
28
- The registry stores one factory per benchmark. This allows the factory to be
29
- constructed without constructing all benchmarks. Going via this ABC allows
32
+ The registry stores one factory per eval. This allows the factory to be
33
+ constructed without constructing all evals. Going via this ABC allows
30
34
  the factory instances to contain state specifically relevant to the
31
- benchmark, as well as supporting different strategies for instantiating it.
35
+ eval, as well as supporting different strategies for instantiating it.
32
36
  E.g. eager vs lazy loading of the required dependencies.
33
37
  """
34
38
 
@@ -41,11 +45,33 @@ class BenchmarkFactory(ABC):
41
45
  def source_module(self) -> str:
42
46
  """Module the task class is defined in, resolvable without importing it."""
43
47
 
48
+ @abstractmethod
49
+ def response_type(self) -> ResponseType:
50
+ """The eval's response type"""
51
+
52
+ @abstractmethod
53
+ def metrics(self) -> list[type["BaseMetric"]]:
54
+ """The eval's metrics"""
55
+
56
+ @abstractmethod
57
+ def create(
58
+ self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None
59
+ ) -> BaseTask: ...
60
+
61
+ @abstractmethod
62
+ def create_perturbation(
63
+ self,
64
+ perturbation_config: PerturbationConfig,
65
+ num_fewshot: int,
66
+ custom_subjects: list[str] | None,
67
+ custom_hf_revision: str | None,
68
+ ) -> BaseTask: ...
69
+
44
70
 
45
- class _Lazy(BenchmarkFactory):
71
+ class _Lazy(EvalFactory):
46
72
  """
47
- Create benchmark from qualified class path; Delays importing modules until
48
- benchmark is constructed.
73
+ Create eval from qualified class path; Delays importing modules until
74
+ eval is constructed.
49
75
  """
50
76
 
51
77
  def __init__(self, class_name: str, module: str, extras: Sequence[str] = ()) -> None:
@@ -73,8 +99,35 @@ class _Lazy(BenchmarkFactory):
73
99
  self._loaded = getattr(module, self._class_name)
74
100
  return self._loaded
75
101
 
102
+ def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
103
+ return self.task_class().with_overwrite(
104
+ num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
105
+ )
106
+
107
+ def create_perturbation(
108
+ self,
109
+ perturbation_config: PerturbationConfig,
110
+ num_fewshot: int,
111
+ custom_subjects: list[str] | None,
112
+ custom_hf_revision: str | None,
113
+ ) -> BaseTask:
114
+ perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
115
+ return perturbation_task_class.with_overwrite(
116
+ num_fewshot=num_fewshot,
117
+ custom_subjects=custom_subjects,
118
+ custom_hf_revision=custom_hf_revision,
119
+ )
76
120
 
77
- class _Eager(BenchmarkFactory):
121
+ def response_type(self) -> ResponseType:
122
+ """The eval's response type"""
123
+ return self.task_class().get_response_type()
124
+
125
+ def metrics(self) -> list[type["BaseMetric"]]:
126
+ """The eval's metrics"""
127
+ return self.task_class().get_metrics()
128
+
129
+
130
+ class _Eager(EvalFactory):
78
131
  """Wraps an already-imported task class."""
79
132
 
80
133
  def __init__(self, task: type[BaseTask]) -> None:
@@ -87,6 +140,33 @@ class _Eager(BenchmarkFactory):
87
140
  def task_class(self) -> type[BaseTask]:
88
141
  return self._task
89
142
 
143
+ def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
144
+ return self.task_class().with_overwrite(
145
+ num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
146
+ )
147
+
148
+ def create_perturbation(
149
+ self,
150
+ perturbation_config: PerturbationConfig,
151
+ num_fewshot: int,
152
+ custom_subjects: list[str] | None,
153
+ custom_hf_revision: str | None,
154
+ ) -> BaseTask:
155
+ perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
156
+ return perturbation_task_class.with_overwrite(
157
+ num_fewshot=num_fewshot,
158
+ custom_subjects=custom_subjects,
159
+ custom_hf_revision=custom_hf_revision,
160
+ )
161
+
162
+ def response_type(self) -> ResponseType:
163
+ """The eval's response type"""
164
+ return self.task_class().get_response_type()
165
+
166
+ def metrics(self) -> list[type["BaseMetric"]]:
167
+ """The eval's metrics"""
168
+ return self.task_class().get_metrics()
169
+
90
170
 
91
171
  class Registry:
92
172
  """A registry for tasks with support for lazy loading.
@@ -97,7 +177,7 @@ class Registry:
97
177
 
98
178
  def __init__(self) -> None:
99
179
  # TODO: Lookup only with upper names
100
- self._registry: dict[str, tuple[str, BenchmarkFactory]] = dict()
180
+ self._registry: dict[str, tuple[str, EvalFactory]] = dict()
101
181
 
102
182
  def __iter__(self) -> Iterator[str]:
103
183
  for name, _ in self._registry.values():
@@ -116,20 +196,20 @@ class Registry:
116
196
  task_key = self._task_key(name)
117
197
  return task_key in self._registry
118
198
 
119
- def __getitem__(self, name: str, /) -> type[BaseTask]:
199
+ def __getitem__(self, name: str, /) -> EvalFactory:
120
200
  task_key = self._task_key(name)
121
201
  try:
122
202
  _, factory = self._registry[task_key]
123
203
  except KeyError:
124
- raise KeyError(f"Task not found: {name}")
204
+ raise KeyError(f"Task not found: {name=} with task_key {task_key=}")
125
205
 
126
- return factory.task_class()
206
+ return factory
127
207
 
128
208
  def add(self, task: type[BaseTask]) -> None:
129
209
  task_key = self._task_key(task.NAME)
130
210
  self._registry[task_key] = (task.NAME, _Eager(task))
131
211
 
132
- def __setitem__(self, name: str, factory: BenchmarkFactory) -> None:
212
+ def __setitem__(self, name: str, factory: EvalFactory) -> None:
133
213
  task_key = self._task_key(name)
134
214
  if task_key in self._registry:
135
215
  raise ValueError(f"Cannot register duplicate task with key: {task_key}")
@@ -140,6 +220,10 @@ class Registry:
140
220
  _REGISTRY = Registry()
141
221
 
142
222
 
223
+ def registry() -> Registry:
224
+ return _REGISTRY
225
+
226
+
143
227
  @contextlib.contextmanager
144
228
  def with_registry(registry: Registry) -> Generator[None, Any, None]:
145
229
  """Contextmanager to change the current registry."""
@@ -183,7 +267,7 @@ def get_task(name: str, /) -> type[BaseTask]:
183
267
 
184
268
  Note: This method will import any lazily registered task.
185
269
  """
186
- return _REGISTRY[name]
270
+ return _REGISTRY[name].task_class()
187
271
 
188
272
 
189
273
  def register_task(task: type[BaseTask]) -> str:
@@ -30,6 +30,8 @@ def register_all_tasks() -> None:
30
30
  register_lazy_task("eval_framework.tasks.benchmarks.goldenswag.GOLDENSWAG_IDK")
31
31
  register_lazy_task("eval_framework.tasks.benchmarks.gpqa.GPQA_OLMES")
32
32
  register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8K_OLMES")
33
+ register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8KBPB")
34
+ register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinervaBPB")
33
35
  register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.GSM8KReasoning")
34
36
  register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG")
35
37
  register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_OLMES")
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
 
7
7
  import tqdm
8
8
 
9
- from eval_framework.tasks.registry import get_task, registered_task_names
9
+ from eval_framework.tasks.registry import registered_task_names, registry
10
10
  from eval_framework.tasks.task_loader import load_extra_tasks
11
11
  from template_formatting.formatter import BaseFormatter, ConcatFormatter, Llama3Formatter
12
12
 
@@ -69,7 +69,8 @@ def generate_docs_for_task(
69
69
  output_docs_directory: Path, task_name: str, formatters: list[BaseFormatter], add_prompt_examples: bool
70
70
  ) -> None:
71
71
  """Generate documentation for a specific task."""
72
- task_class = get_task(task_name)
72
+ eval_ = registry()[task_name]
73
+ task_class = eval_.task_class()
73
74
 
74
75
  try:
75
76
  num_fewshot = 1
@@ -98,16 +99,9 @@ def generate_docs_for_task(
98
99
  f.write(f"SAMPLE_SPLIT = {task.SAMPLE_SPLIT}".strip() + "\n")
99
100
  if hasattr(task, "FEWSHOT_SPLIT"):
100
101
  f.write(f"FEWSHOT_SPLIT = {task.FEWSHOT_SPLIT}".strip() + "\n")
101
- if hasattr(task, "TASK_STYLER"):
102
- f.write(f"RESPONSE_TYPE = {task.TASK_STYLER.response_type.name}".strip() + "\n")
103
- metrics_list = [f"{m.__name__}" for m in task.TASK_STYLER.metrics]
104
- f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
105
- else:
106
- if hasattr(task, "RESPONSE_TYPE"):
107
- f.write(f"RESPONSE_TYPE = {task.RESPONSE_TYPE.name}".strip() + "\n")
108
- if hasattr(task, "METRICS"):
109
- metrics_list = [f"{m.__name__}" for m in task.METRICS]
110
- f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
102
+ f.write(f"RESPONSE_TYPE = {eval_.response_type().name}".strip() + "\n")
103
+ metrics_list = [f"{m.__name__}" for m in eval_.metrics()]
104
+ f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
111
105
  if hasattr(task, "SUBJECTS"):
112
106
  f.write(f"SUBJECTS = {repr(task.SUBJECTS)}".strip() + "\n")
113
107
  if hasattr(task, "LANGUAGE"):
File without changes
File without changes