eval-framework 0.3.8__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. {eval_framework-0.3.8 → eval_framework-0.5.0}/PKG-INFO +10 -14
  2. {eval_framework-0.3.8 → eval_framework-0.5.0}/README.md +0 -1
  3. {eval_framework-0.3.8 → eval_framework-0.5.0}/pyproject.toml +15 -36
  4. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/openai.py +2 -2
  5. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/base.py +1 -1
  6. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_assertion.py +4 -14
  7. eval_framework-0.5.0/src/eval_framework/tasks/__init__.py +12 -0
  8. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/base.py +3 -3
  9. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
  10. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
  11. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
  12. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -1
  13. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/squad.py +21 -0
  14. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py +27 -1
  15. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
  16. {eval_framework-0.3.8/src/eval_framework/tasks/benchmarks → eval_framework-0.5.0/src/eval_framework/tasks}/dataset_revisions.py +30 -7
  17. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/registry.py +76 -45
  18. eval_framework-0.5.0/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
  19. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_names.py +2 -122
  20. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_style.py +64 -2
  21. eval_framework-0.3.8/src/eval_framework/metrics/completion/comet.py +0 -56
  22. eval_framework-0.3.8/src/eval_framework/tasks/__init__.py +0 -6
  23. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +0 -179
  24. {eval_framework-0.3.8 → eval_framework-0.5.0}/LICENSE +0 -0
  25. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/__init__.py +0 -0
  26. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/base_config.py +0 -0
  27. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/__init__.py +0 -0
  28. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/determined.py +0 -0
  29. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/eval.py +0 -0
  30. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/local.py +0 -0
  31. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/evaluation_generator.py +0 -0
  32. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/exceptions.py +0 -0
  33. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/drop_process_results.py +0 -0
  34. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  35. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  36. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  37. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  38. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  39. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/__init__.py +0 -0
  40. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/aleph_alpha.py +0 -0
  41. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/base.py +0 -0
  42. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/huggingface.py +0 -0
  43. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/mistral.py +0 -0
  44. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/models.py +0 -0
  45. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/vllm.py +0 -0
  46. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/vllm_local_server.py +0 -0
  47. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/logger.py +0 -0
  48. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/main.py +0 -0
  49. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/__init__.py +0 -0
  50. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  51. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  52. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/__init__.py +0 -0
  53. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  54. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  55. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/bleu.py +0 -0
  56. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/chrf.py +0 -0
  57. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  58. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  59. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  60. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  61. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  62. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  63. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/f1.py +0 -0
  64. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  65. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  66. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  67. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/json_format.py +0 -0
  68. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  69. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/length_control.py +0 -0
  70. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  71. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  72. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  73. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  74. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  75. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  76. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/repetition.py +0 -0
  77. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  78. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  79. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  80. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  81. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  82. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ter.py +0 -0
  83. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  84. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  85. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  86. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/__init__.py +0 -0
  87. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/base.py +0 -0
  88. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  89. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  90. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  91. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  92. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  93. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  94. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  95. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  96. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  97. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  98. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  99. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  100. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  101. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  102. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  103. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  104. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  105. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  106. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  107. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  108. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  109. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  110. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  111. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  112. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  113. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/utils.py +0 -0
  114. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  115. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  116. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  117. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  118. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  119. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  120. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  121. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  122. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/py.typed +0 -0
  123. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/response_generator.py +0 -0
  124. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/__init__.py +0 -0
  125. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/base.py +0 -0
  126. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  127. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/result_processor.py +0 -0
  128. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  129. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/run.py +0 -0
  130. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/run_direct.py +0 -0
  131. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/shared/types.py +0 -0
  132. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/suite.py +0 -0
  133. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  134. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  135. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  136. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  137. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  138. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  139. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  140. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  141. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  142. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  143. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  144. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  145. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  146. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  147. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  148. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  149. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  150. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  151. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  152. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  153. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  154. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  155. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  156. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  157. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  158. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  159. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  160. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  161. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  162. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  163. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  164. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  165. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  166. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  167. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  168. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  169. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  170. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  171. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  172. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  173. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  174. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  175. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  176. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  177. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  178. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  179. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  180. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  181. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/eval_config.py +0 -0
  182. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/perturbation.py +0 -0
  183. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_loader.py +0 -0
  184. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/utils.py +0 -0
  185. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/constants.py +0 -0
  186. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/file_ops.py +0 -0
  187. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/generate_task_docs.py +0 -0
  188. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/helpers.py +0 -0
  189. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/logging.py +0 -0
  190. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/packaging.py +0 -0
  191. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/tqdm_handler.py +0 -0
  192. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/README.md +0 -0
  193. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/__init__.py +0 -0
  194. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/formatter.py +0 -0
  195. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/mistral_formatter.py +0 -0
  196. {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.8
3
+ Version: 0.5.0
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
212
212
  Classifier: Topic :: Software Development :: Libraries
213
213
  Classifier: Typing :: Typed
214
214
  Requires-Dist: pyyaml>=6.0.3,<7
215
- Requires-Dist: xmltodict>=0.15.1,<0.16
215
+ Requires-Dist: xmltodict>=1.0.4,<1.1
216
216
  Requires-Dist: pydantic>=2.13.4,<3
217
- Requires-Dist: datasets>=4.8.5,<5
217
+ Requires-Dist: datasets>=5.0.0,<6
218
218
  Requires-Dist: sacrebleu>=2.6.0,<3
219
- Requires-Dist: pycountry>=24.6.1,<25
219
+ Requires-Dist: pycountry>=26.2.16,<27
220
220
  Requires-Dist: nltk>=3.9.4,<4
221
221
  Requires-Dist: python-dotenv>=1.2.2,<2
222
222
  Requires-Dist: lingua-language-detector>=2.2.0,<3
223
223
  Requires-Dist: google-crc32c>=1.8.0,<2
224
- Requires-Dist: kubernetes>=31.0.0,<32
225
224
  Requires-Dist: langdetect>=1.0.9,<2
226
225
  Requires-Dist: spacy>=3.8.14,<4
227
226
  Requires-Dist: jsonschema>=4.26.0,<5
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
232
231
  Requires-Dist: jsonlines>=4,<5
233
232
  Requires-Dist: lxml>=6.1.1,<7
234
233
  Requires-Dist: python-iso639>=2026.4.20
235
- Requires-Dist: wandb>=0.27.0,<1
236
- Requires-Dist: boto3>=1.43.18,<2
237
- Requires-Dist: numpy>=1.26.4
234
+ Requires-Dist: wandb>=0.27.2,<1
235
+ Requires-Dist: boto3>=1.43.19,<2
236
+ Requires-Dist: numpy>=2.2.6
238
237
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
238
  Requires-Dist: scipy>=1.17.1,<2
240
239
  Requires-Dist: accelerate ; extra == 'accelerate'
241
- Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
240
+ Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
242
241
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
243
- Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
244
242
  Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
245
243
  Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
246
- Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
244
+ Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
247
245
  Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
248
246
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
249
247
  Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
253
251
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
254
252
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
255
253
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
256
- Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
254
+ Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
257
255
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
258
256
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
259
257
  Requires-Python: >=3.12, <3.13
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
261
259
  Provides-Extra: accelerate
262
260
  Provides-Extra: all
263
261
  Provides-Extra: api
264
- Provides-Extra: comet
265
262
  Provides-Extra: determined
266
263
  Provides-Extra: mistral
267
264
  Provides-Extra: openai
@@ -319,7 +316,6 @@ pip install eval_framework
319
316
 
320
317
  There are optional extras available to unlock specific features of the library:
321
318
  - `api` for inference using the aleph-alpha client.
322
- - `comet` for the COMET metric.
323
319
  - `determined` for running jobs via determined.
324
320
  - `mistral` for inference on Mistral models.
325
321
  - `transformers` for inference using the transformers library.
@@ -47,7 +47,6 @@ pip install eval_framework
47
47
 
48
48
  There are optional extras available to unlock specific features of the library:
49
49
  - `api` for inference using the aleph-alpha client.
50
- - `comet` for the COMET metric.
51
50
  - `determined` for running jobs via determined.
52
51
  - `mistral` for inference on Mistral models.
53
52
  - `transformers` for inference using the transformers library.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.8"
3
+ version = "0.5.0"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -19,16 +19,15 @@ classifiers = [
19
19
  ]
20
20
  dependencies = [
21
21
  "pyyaml>=6.0.3,<7",
22
- "xmltodict>=0.15.1,<0.16",
22
+ "xmltodict>=1.0.4,<1.1",
23
23
  "pydantic>=2.13.4,<3",
24
- "datasets>=4.8.5,<5",
24
+ "datasets>=5.0.0,<6",
25
25
  "sacrebleu>=2.6.0,<3",
26
- "pycountry>=24.6.1,<25",
26
+ "pycountry>=26.2.16,<27",
27
27
  "nltk>=3.9.4,<4",
28
28
  "python-dotenv>=1.2.2,<2",
29
29
  "lingua-language-detector>=2.2.0,<3",
30
30
  "google-crc32c>=1.8.0,<2",
31
- "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
32
31
  "langdetect>=1.0.9,<2", # required by the original ifeval implementation
33
32
  "spacy>=3.8.14,<4",
34
33
  "jsonschema>=4.26.0,<5",
@@ -39,14 +38,13 @@ dependencies = [
39
38
  "jsonlines>=4,<5",
40
39
  "lxml>=6.1.1,<7",
41
40
  "python-iso639>=2026.4.20",
42
- "wandb>=0.27.0,<1",
43
- "boto3>=1.43.18,<2",
44
- "numpy>=1.26.4",
41
+ "wandb>=0.27.2,<1",
42
+ "boto3>=1.43.19,<2",
43
+ "numpy>=2.2.6",
45
44
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
46
45
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
47
46
  "antlr4-python3-runtime==4.11.0",
48
47
  "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
49
-
50
48
  ]
51
49
 
52
50
  [project.optional-dependencies]
@@ -64,7 +62,7 @@ openai = [
64
62
  transformers = [
65
63
  "transformers>=4.45.2,<5",
66
64
  "torch>=2.5,<3",
67
- "accelerate>=0.34.2,<1",
65
+ "accelerate>=1.14.0,<2",
68
66
  ]
69
67
  accelerate = ["accelerate"]
70
68
  vllm = [
@@ -72,21 +70,17 @@ vllm = [
72
70
  "torch>=2.5,<3"
73
71
  ]
74
72
  mistral = [
75
- "mistral-common>=1.11.2,<2",
73
+ "mistral-common>=1.11.3,<2",
76
74
  "huggingface-hub>=0.36.2,<0.37",
77
75
  "eval_framework[vllm]",
78
76
  ]
79
- # Benchmark/metric specific extras
80
- comet = [
81
- "unbabel-comet>=2.2.7,<3",
82
- ]
83
77
  # from template-formatting
84
78
  optional = [
85
79
  "transformers>=4.45.2,<5",
86
80
  "jinja2>=3.1.6,<4"
87
81
  ]
88
82
  all = [
89
- "eval_framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral]"
83
+ "eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
90
84
  ]
91
85
 
92
86
  [project.urls]
@@ -98,15 +92,15 @@ eval_framework = "eval_framework.run:run"
98
92
  [dependency-groups]
99
93
  dev = [
100
94
  "mypy>=2.1.0,<3",
101
- "pytest>=9.0.3,<10",
95
+ "pytest>=9.1.0,<10",
102
96
  "pytest-mock>=3.15.1",
103
97
  "pytest-xdist>=3.8.0,<4",
104
98
  "pytest-sugar>1.1,<2",
105
99
  "types-pyyaml>=6.0.12.20260518,<7",
106
100
  "types-python-dateutil>=2.9.0.20260518,<3",
107
101
  "types-requests>=2.33.0.20260518,<3",
108
- "plotly>=5.24.1,<6",
109
- "ruff>=0.15.15",
102
+ "plotly>=6.8.0,<7",
103
+ "ruff>=0.15.18",
110
104
  "pip-licenses>=5.5.5",
111
105
  ]
112
106
  flash-attn = [
@@ -115,7 +109,7 @@ flash-attn = [
115
109
  ]
116
110
 
117
111
  [build-system]
118
- requires = ["uv_build>=0.11.17,<0.11.18"]
112
+ requires = ["uv_build>=0.11.22,<0.11.23"]
119
113
  build-backend = "uv_build"
120
114
 
121
115
  [tool.uv.build-backend]
@@ -126,22 +120,6 @@ override-dependencies = [
126
120
  "requests>=2.32,<3", # fix for determined
127
121
  ]
128
122
 
129
- [tool.uv.sources]
130
- torch = [
131
- { index = "pytorch-default", marker = "sys_platform != 'linux'" },
132
- { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
133
- ]
134
-
135
- [[tool.uv.index]]
136
- name = "pytorch-cu124"
137
- url = "https://download.pytorch.org/whl/cu124"
138
- explicit = true
139
-
140
- [[tool.uv.index]]
141
- name = "pytorch-default"
142
- url = "https://pypi.org/simple"
143
- explicit = true
144
-
145
123
  [tool.uv.extra-build-dependencies]
146
124
  # Build flash-attn with the same torch version as in the container. Details at:
147
125
  # https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
167
145
 
168
146
  [tool.ruff.lint.extend-per-file-ignores]
169
147
  "__init__.py" = ["F401"]
148
+ "tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
170
149
 
171
150
  [tool.mypy]
172
151
  plugins = "pydantic.mypy"
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
55
55
  formatter: BaseFormatter | None = None,
56
56
  temperature: float | None = None,
57
57
  top_p: float | None = None,
58
- api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
58
+ api_key: str | None = None,
59
59
  organization: str | None = None,
60
60
  base_url: str | None = None,
61
61
  bytes_per_token: float | None = None,
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
86
86
  self._top_p = top_p
87
87
 
88
88
  self._client = OpenAI(
89
- api_key=api_key,
89
+ api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
90
90
  organization=organization,
91
91
  base_url=base_url,
92
92
  )
@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
36
36
  # macro averaging the overall computation default.
37
37
  AGGREGATORS: list[Aggregator] = []
38
38
  # Set by the evaluation generator before calculate(); controls how infra failures are handled.
39
- fail_on_error: bool = False
39
+ fail_on_error: bool = True
40
40
 
41
41
  @classproperty
42
42
  def NAMES(cls) -> list[str]:
@@ -1,7 +1,7 @@
1
1
  from llm_sandbox.exceptions import SandboxTimeoutError
2
2
 
3
3
  from eval_framework.metrics.base import BaseMetric, MetricResult
4
- from eval_framework.shared.types import Completion, Error
4
+ from eval_framework.shared.types import Completion
5
5
  from eval_framework.tasks.utils import run_python_code
6
6
 
7
7
 
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
16
16
  code = response.completion
17
17
  try:
18
18
  output = run_python_code(code, image="python:3.12-slim")
19
- except SandboxTimeoutError as e:
19
+ except SandboxTimeoutError:
20
20
  # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
21
21
  # problem.
22
22
  import traceback
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
26
26
  metric_name=self.NAME,
27
27
  value=0.0,
28
28
  higher_is_better=True,
29
- error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
29
+ code_execution_trace=traceback.format_exc(),
30
30
  )
31
31
  ]
32
32
  except Exception as e:
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
42
42
  last_output = output_parts[-1]
43
43
 
44
44
  success = last_output == "True"
45
- error = (
46
- None
47
- if success
48
- else Error(
49
- error_class="CodeCompletionAssertionError",
50
- message=f"Expected 'True' but got '{last_output}'",
51
- traceback=output,
52
- )
53
- )
54
-
55
45
  return [
56
46
  MetricResult(
57
47
  metric_name=self.NAME,
58
48
  value=1.0 if success else 0.0,
59
49
  higher_is_better=True,
60
- error=error,
50
+ error=None,
61
51
  code_execution_trace=output,
62
52
  )
63
53
  ]
@@ -0,0 +1,12 @@
1
+ # Register all tasks on import
2
+ from pathlib import Path
3
+
4
+ from .dataset_revisions import DatasetRevision
5
+ from .task_names import register_all_tasks
6
+
7
+ DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
8
+
9
+ register_all_tasks()
10
+
11
+ del register_all_tasks
12
+ del DatasetRevision
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
15
15
  from pydantic import BaseModel, ConfigDict
16
16
 
17
17
  from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
18
- from eval_framework.tasks.benchmarks.dataset_revisions import get_pinned_dataset_revision
18
+ from eval_framework.tasks.dataset_revisions import DatasetRevision
19
19
  from eval_framework.tasks.utils import classproperty, raise_errors
20
20
  from template_formatting.formatter import Message, Role
21
21
 
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
118
118
  # Applied once at instance creation; not refreshed if the pin file changes mid-run.
119
119
  if custom_hf_revision:
120
120
  self.HF_REVISION = custom_hf_revision
121
- elif self.HF_REVISION is None and (pinned := get_pinned_dataset_revision(self.__class__.__name__)):
121
+ elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
122
122
  self.HF_REVISION = pinned
123
123
 
124
124
  @classmethod
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
359
359
  samples: list[Sample],
360
360
  stop_sequences: list[str] | None = None,
361
361
  max_tokens: int | None = None,
362
- fail_on_error: bool = False,
362
+ fail_on_error: bool = True,
363
363
  ) -> list[Completion]:
364
364
  """
365
365
  Generates completions for the sample.
@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
109
109
  stop_sequences: list[str] | None,
110
110
  max_tokens: int | None,
111
111
  initial_samples: list[Sample],
112
- fail_on_error: bool = False,
112
+ fail_on_error: bool = True,
113
113
  ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
114
114
  initial_messages = [s.messages for s in initial_samples]
115
115
  samples = [(s, False) for s in initial_samples] # (sample, is_done)
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
170
170
  samples: list[Sample],
171
171
  stop_sequences: list[str] | None = None,
172
172
  max_tokens: int | None = None,
173
- fail_on_error: bool = False,
173
+ fail_on_error: bool = True,
174
174
  ) -> list[Completion]:
175
175
  assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
176
176
  "Each sample must have exactly one USER message."
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import random
3
3
  from pathlib import Path
4
- from typing import Any
4
+ from typing import Any, cast
5
5
 
6
6
  import pycountry
7
7
  from datasets import DatasetDict, DownloadConfig, load_dataset
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
100
100
 
101
101
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
102
102
  source_key = item["subject"].split("-")[0]
103
- source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
103
+ source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
104
104
  source = item[f"sentence_{source_key}"]
105
105
  instruction = f"{source_language} sentence: {source}\n"
106
106
  target_key = item["subject"].split("-")[1]
107
- target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
107
+ target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
108
108
 
109
109
  return f"{instruction}{target_language} sentence:"
110
110
 
@@ -4,7 +4,6 @@ from typing import Any
4
4
 
5
5
  from eval_framework.metrics.completion.bleu import BLEU
6
6
  from eval_framework.metrics.completion.chrf import CHRF
7
- from eval_framework.metrics.completion.comet import COMET
8
7
  from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
9
8
  from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
10
9
 
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
29
28
  SAMPLE_SPLIT = "dev"
30
29
  FEWSHOT_SPLIT = "devtest"
31
30
  RESPONSE_TYPE = ResponseType.COMPLETION
32
- METRICS = [BLEU, CHRF, COMET]
31
+ METRICS = [BLEU, CHRF]
33
32
  SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
34
33
  PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
35
34
  LANGUAGE = {
@@ -95,7 +95,6 @@ class GSM8KEvalHarness(BaseTask[str]):
95
95
 
96
96
  NAME = "GSM8KEvalHarness"
97
97
  DATASET_PATH = "openai/gsm8k"
98
- HF_REVISION = "main"
99
98
  SAMPLE_SPLIT = "test"
100
99
  FEWSHOT_SPLIT = "train"
101
100
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -236,6 +236,27 @@ class SQUAD(SQUAD2):
236
236
  return item["answers"]["text"]
237
237
 
238
238
 
239
+ class SQuAD2_MA(SQUAD2):
240
+ """SQuAD v2 with the exact system prompt used in MA training"""
241
+
242
+ NAME = "SQuAD2_MA"
243
+ UNANSWERABLE_STR = "unanswerable"
244
+
245
+ METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
246
+
247
+ def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
248
+ return (
249
+ "You are a helpful assistant and will answer the user's questions carefully, "
250
+ "logically, accurately and well-reasoned.\n"
251
+ "Use the given context to answer the question faithfully. Answer only if the "
252
+ f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
253
+ "if the answer is not present in the context."
254
+ )
255
+
256
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
257
+ return f"Context:\n{item['context']}\n\nQuestion:\n{item['question']}\n"
258
+
259
+
239
260
  class SQuAD_OLMES(SQUAD):
240
261
  """SQuAD variant matching OLMES implementation."""
241
262
 
@@ -2,7 +2,7 @@ import random
2
2
  from typing import Any
3
3
 
4
4
  from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
5
- from eval_framework.metrics.completion.f1 import F1
5
+ from eval_framework.metrics.completion.f1 import F1, F1SquadNormalized
6
6
  from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
7
7
 
8
8
 
@@ -40,3 +40,29 @@ class TRIVIAQA(BaseTask[str]):
40
40
 
41
41
  def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
42
42
  return completion_text.strip().rstrip(".")
43
+
44
+
45
+ class TriviaQA_MA(TRIVIAQA):
46
+ """TriviaQA with the exact system prompt used in MA training"""
47
+
48
+ NAME = "TriviaQA_MA"
49
+ SUBJECTS = ["rc.wikipedia"]
50
+ UNANSWERABLE_STR = "unanswerable"
51
+
52
+ METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
53
+ PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
54
+
55
+ def _get_context_text(self, item: dict[str, Any]) -> str:
56
+ return "\n\n".join(item["entity_pages"]["wiki_context"])
57
+
58
+ def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
59
+ return (
60
+ "You are a helpful assistant and will answer the user's questions carefully, "
61
+ "logically, accurately and well-reasoned.\n"
62
+ "Use the given context to answer the question faithfully. Answer only if the "
63
+ f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
64
+ "if the answer is not present in the context."
65
+ )
66
+
67
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
68
+ return f"Context:\n{self._get_context_text(item)}\n\nQuestion:\n{item['question'].strip()}\n"
@@ -1,6 +1,6 @@
1
1
  import random
2
2
  from abc import ABC
3
- from typing import Any
3
+ from typing import Any, cast
4
4
 
5
5
  import pycountry
6
6
  import sacrebleu
@@ -38,7 +38,7 @@ class WMT(BaseTask[str], ABC):
38
38
  def _code_to_language(self, code: str) -> str:
39
39
  # key is alpha_2 or alpha_3 depending on the code length
40
40
  key = f"alpha_{len(code)}"
41
- language_tuple = pycountry.languages.get(**{key: code})
41
+ language_tuple = cast(Any, pycountry.languages.get(**{key: code}))
42
42
  return language_tuple.name
43
43
 
44
44
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
@@ -25,12 +25,35 @@ def _pinned_revisions(revisions_file: Path) -> dict[str, str]:
25
25
  return json.loads(revisions_file.read_text(encoding="utf-8"))
26
26
 
27
27
 
28
- def get_pinned_dataset_revision(
29
- task_class_name: str,
30
- *,
31
- revisions_file: Path | None = None,
32
- ) -> str | None:
33
- return _pinned_revisions(revisions_file or REVISIONS_FILE).get(task_class_name)
28
+ class DatasetRevision:
29
+ _INSTANCE: "DatasetRevision | None" = None
30
+
31
+ def __init__(self) -> None:
32
+ self._cache: dict[str, str] = {}
33
+
34
+ @classmethod
35
+ def _get_instance(cls) -> "DatasetRevision":
36
+ if cls._INSTANCE is None:
37
+ cls._INSTANCE = cls()
38
+ return cls._INSTANCE
39
+
40
+ @classmethod
41
+ def add_revision_file(cls, file_path: Path | str) -> None:
42
+ instance = cls._get_instance()
43
+ instance._append_revision_file(Path(file_path))
44
+
45
+ @classmethod
46
+ def pinned_revision(cls, task_class_name: str) -> str | None:
47
+ return cls._get_instance()._cache.get(task_class_name)
48
+
49
+ @classmethod
50
+ def reset(cls) -> None:
51
+ # for unit tests only.
52
+ cls._INSTANCE = None
53
+
54
+ def _append_revision_file(self, file_path: Path) -> None:
55
+ revisions = _pinned_revisions(file_path)
56
+ self._cache |= revisions
34
57
 
35
58
 
36
59
  def _repo_sha(api: HfApi, repo_id: str, cache: dict[str, str | None]) -> str | None:
@@ -73,7 +96,7 @@ def main() -> None:
73
96
  revisions = collect_dataset_revisions(registered_task_names(), HfApi())
74
97
  REVISIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
75
98
  REVISIONS_FILE.write_text(
76
- json.dumps(dict(sorted(revisions.items())), indent=2, ensure_ascii=False) + "\n",
99
+ json.dumps(dict(sorted(revisions.items())), indent=4, ensure_ascii=False) + "\n",
77
100
  encoding="utf-8",
78
101
  )
79
102
  logger.info("Wrote %d revisions to %s", len(revisions), REVISIONS_FILE)