eval-framework 0.3.4__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.4 → eval_framework-0.3.6}/PKG-INFO +1 -1
  2. {eval_framework-0.3.4 → eval_framework-0.3.6}/pyproject.toml +1 -1
  3. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/evaluation_generator.py +27 -2
  4. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/math_minerva_completion.py +33 -12
  5. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/minerva_math_utils.py +45 -4
  6. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/response_generator.py +4 -3
  7. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/result_processors/result_processor.py +4 -4
  8. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/base.py +5 -2
  9. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/aidanbench.py +11 -2
  10. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/eval_config.py +4 -0
  11. {eval_framework-0.3.4 → eval_framework-0.3.6}/LICENSE +0 -0
  12. {eval_framework-0.3.4 → eval_framework-0.3.6}/README.md +0 -0
  13. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/__init__.py +0 -0
  14. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/base_config.py +0 -0
  15. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/context/__init__.py +0 -0
  16. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/context/determined.py +0 -0
  17. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/context/eval.py +0 -0
  18. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/context/local.py +0 -0
  19. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/exceptions.py +0 -0
  20. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/drop_process_results.py +0 -0
  21. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  22. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  23. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  24. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  25. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  26. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/__init__.py +0 -0
  27. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/aleph_alpha.py +0 -0
  28. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/base.py +0 -0
  29. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/huggingface.py +0 -0
  30. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/mistral.py +0 -0
  31. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/models.py +0 -0
  32. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/openai.py +0 -0
  33. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/llm/vllm.py +0 -0
  34. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/logger.py +0 -0
  35. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/main.py +0 -0
  36. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/__init__.py +0 -0
  37. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  38. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  39. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/base.py +0 -0
  40. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/__init__.py +0 -0
  41. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  42. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  43. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/bleu.py +0 -0
  44. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/chrf.py +0 -0
  45. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  46. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  47. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/comet.py +0 -0
  48. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  49. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  50. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  51. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  52. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  53. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/f1.py +0 -0
  54. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  55. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  56. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  57. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/json_format.py +0 -0
  58. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  59. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/length_control.py +0 -0
  60. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  61. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  62. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  63. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  64. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/repetition.py +0 -0
  65. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  66. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  67. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  68. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  69. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  70. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/ter.py +0 -0
  71. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  72. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  73. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  74. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/__init__.py +0 -0
  75. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/base.py +0 -0
  76. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  77. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  78. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  79. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  80. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  81. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  82. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  83. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  84. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  85. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  86. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  87. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  88. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  89. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  90. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  91. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  92. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  93. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  94. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  95. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  96. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  97. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  98. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  99. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  100. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  101. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/utils.py +0 -0
  102. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  103. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  104. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  105. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  106. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  107. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  108. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  109. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  110. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/py.typed +0 -0
  111. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/result_processors/__init__.py +0 -0
  112. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/result_processors/base.py +0 -0
  113. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  114. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  115. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/run.py +0 -0
  116. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/run_direct.py +0 -0
  117. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/shared/types.py +0 -0
  118. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/suite.py +0 -0
  119. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  120. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/__init__.py +0 -0
  121. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  122. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  144. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  145. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  146. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  147. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  148. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  149. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  150. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  151. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  152. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  153. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  154. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  155. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  156. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  157. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  158. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  159. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  160. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  161. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  162. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  163. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  164. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  165. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  166. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  167. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  168. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  169. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  170. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  171. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  172. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  173. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  174. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.4 → eval_framework-0.3.6}/src/template_formatting/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.4
+Version: 0.3.6
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.3.4"
+version = "0.3.6"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
src/eval_framework/evaluation_generator.py
@@ -127,7 +127,18 @@ class EvaluationGenerator:
         return results
 
     def _aggregate_results(self, results: list[Result]) -> dict[str, float | None]:
-        data = pd.DataFrame([r.model_dump() for r in results])
+        data = pd.DataFrame(
+            [
+                {
+                    "metric_name": r.metric_name,
+                    "subject": r.subject,
+                    "key": r.key,
+                    "value": r.value,
+                    "error": r.error,
+                }
+                for r in results
+            ]
+        )
         if len(data) == 0:
             return {}
         data.fillna({"key": ""}, inplace=True)
@@ -251,7 +262,20 @@ class EvaluationGenerator:
         return aggregated_results
 
     def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
-        data = pd.DataFrame([r.model_dump() for r in results])
+        data = pd.DataFrame(
+            [
+                {
+                    "metric_name": r.metric_name,
+                    "metric_class_name": r.metric_class_name,
+                    "subject": r.subject,
+                    "key": r.key,
+                    "value": r.value,
+                    "error": r.error,
+                    "prompt": r.prompt,
+                }
+                for r in results
+            ]
+        )
         if len(data) == 0:
             return {}
         data = data.fillna({"key": ""})
@@ -313,6 +337,7 @@ class EvaluationGenerator:
             raise ValueError("No saved completions found. Run 'run_completions' first.")
 
         metrics_results = self._run_metric_calculators(responses)
+        del responses
         aggregated_results = self._aggregate_results(metrics_results)
         results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
         aggregated_results.update(results_with_aggregators)
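
Both aggregation hunks replace a blanket `r.model_dump()` with an explicit projection onto the few fields the aggregation actually uses, so bulky fields stay out of the DataFrame. A minimal, self-contained sketch of that pattern; the simplified `Result` model below is illustrative, not the package's full schema:

    import pandas as pd
    from pydantic import BaseModel

    class Result(BaseModel):  # simplified stand-in for eval_framework's Result model
        metric_name: str
        subject: str
        key: str | None = None
        value: float | None = None
        error: str | None = None
        prompt: str = ""  # bulky field that the projection deliberately leaves out

    results = [
        Result(metric_name="accuracy", subject="algebra", key="Exact", value=1.0, prompt="..."),
        Result(metric_name="accuracy", subject="algebra", key="Exact", value=0.0, prompt="..."),
    ]

    # Projecting onto named columns keeps large fields (prompts, raw completions)
    # out of the aggregation DataFrame:
    data = pd.DataFrame([{"metric_name": r.metric_name, "key": r.key, "value": r.value} for r in results])
    print(data.groupby(["metric_name", "key"])["value"].mean())  # accuracy/Exact -> 0.5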
src/eval_framework/metrics/completion/math_minerva_completion.py
@@ -17,21 +17,28 @@ class MathMinervaCompletion(BaseMetric[Completion]):
     Minerva MATH: reports Exact Match and Exact Match (Flex).
     Uses raw_completion to extract multiple candidates; primary for exact_match,
     all candidates with both Minerva and Hendrycks equivalence for exact_match_flex.
+
+    English Minerva extraction is the default. Subclasses select other
+    final-answer styles by overriding ``COT_STYLE`` / ``RELAXED``.
     """
 
     NAME = "Math Minerva Completion"
     KEYS = ["Exact", "Exact Flex"]
     AGGREGATORS = [PassAtK()]
 
+    # Defaults; subclasses override these class attributes to define variants.
+    COT_STYLE: str = "minerva"
+    RELAXED: bool = False
+
     def __init__(
         self,
         use_cot: bool = True,
-        cot_style: str = "minerva",
-        relaxed: bool = False,
+        cot_style: str | None = None,
+        relaxed: bool | None = None,
     ) -> None:
         self.use_cot = use_cot
-        self.cot_style = cot_style
-        self.relaxed = relaxed
+        self.cot_style = cot_style if cot_style is not None else self.COT_STYLE
+        self.relaxed = relaxed if relaxed is not None else self.RELAXED
 
     def calculate(self, response: Completion) -> list[MetricResult]:
         if response.error:
@@ -60,7 +67,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
         ]
 
         raw = response.raw_completion or response.completion
-        all_candidates = extract_answers(raw, use_cot=self.use_cot, cot_style=self.cot_style, relaxed=self.relaxed)
+        all_candidates = extract_answers(
+            raw,
+            use_cot=self.use_cot,
+            cot_style=self.cot_style,
+            relaxed=self.relaxed,
+        )
 
         exact_match = 0.0
         if all_candidates:
@@ -83,10 +95,19 @@ class MathMinervaCompletion(BaseMetric[Completion]):
 class MathMinervaCompletionRelaxed(MathMinervaCompletion):
     """MathMinervaCompletion with relaxed=True by default (flexible final-answer matching)."""
 
-    def __init__(
-        self,
-        use_cot: bool = True,
-        cot_style: str = "minerva",
-        relaxed: bool = True,
-    ) -> None:
-        super().__init__(use_cot=use_cot, cot_style=cot_style, relaxed=relaxed)
+    NAME = "Math Minerva Completion Relaxed"
+    RELAXED = True
+
+
+class MathMinervaCompletionDE(MathMinervaCompletion):
+    """MathMinervaCompletion with German final-answer extraction (``Finale Antwort: …``)."""
+
+    NAME = "Math Minerva Completion DE"
+    COT_STYLE = "minerva_de"
+
+
+class MathMinervaCompletionRelaxedDE(MathMinervaCompletionDE):
+    """MathMinervaCompletionDE with relaxed=True by default."""
+
+    NAME = "Math Minerva Completion Relaxed DE"
+    RELAXED = True
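
The variant classes now carry their configuration in the `COT_STYLE` / `RELAXED` class attributes instead of each repeating an `__init__` override. Under that pattern, a further variant needs nothing but new attribute values. The subclass below is hypothetical, not part of the package, and its `"minerva_fr"` style would also need a `COT_EXTRACTORS` entry (see the registry in the next file):

    class MathMinervaCompletionFR(MathMinervaCompletion):  # hypothetical example
        """MathMinervaCompletion with French final-answer extraction."""

        NAME = "Math Minerva Completion FR"
        COT_STYLE = "minerva_fr"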
src/eval_framework/metrics/completion/minerva_math_utils.py
@@ -12,6 +12,7 @@ from sympy.parsing.latex.errors import LaTeXParsingError
 
 INVALID_ANSWER = "[invalidanswer]"
 END_SEQ = "I hope it is correct."
+END_SEQ_DE = "Ich hoffe, die Antwort ist korrekt."  # German pendant to END_SEQ
 
 # Minerva normalize_final_answer: appendix D of Lewkowycz et al. (2022)
 SUBSTITUTIONS = [
@@ -143,6 +144,44 @@ def get_unnormalized_answer(text: str, relaxed: bool = False) -> str:
     return INVALID_ANSWER
 
 
+def get_unnormalized_answer_de(text: str, relaxed: bool = False) -> str:
+    """German analogue of ``get_unnormalized_answer``."""
+    if relaxed:
+        match = re.search(
+            r"(?i)(?:finale|endgültige)\s+antwort\s*:\s*"
+            r"(?:die\s+(?:finale\s+|endgültige\s+)?antwort\s+(?:ist|lautet)\s*)?(.*)",
+            text,
+            re.DOTALL,
+        )
+        if match:
+            raw = match.group(1).strip()
+            raw = re.sub(
+                r"\.?\s*ich\s+hoffe,?\s+(?:die\s+antwort|sie|es)\s+(?:ist|sei)\s+korrekt\.?\s*$",
+                "",
+                raw,
+                flags=re.IGNORECASE,
+            ).strip()
+            return raw
+        return INVALID_ANSWER
+    text = text + END_SEQ_DE
+    match = re.search(
+        r"Finale Antwort: Die finale Antwort lautet(.*?)\. Ich hoffe, die Antwort ist korrekt\.",
+        text,
+    )
+    if match:
+        return match.group(1).strip()
+    return INVALID_ANSWER
+
+
+# Registry of supported `cot_style` values
+# Keys are the strings passed by metric configurations; values are language-specific final-answer extractors
+# `(text: str, relaxed: bool) -> str`. Extend this dict to add a new language.
+COT_EXTRACTORS = {
+    "minerva": get_unnormalized_answer,
+    "minerva_de": get_unnormalized_answer_de,
+}
+
+
 def normalized_gold_from_solution(solution: str) -> str | None:
     """Extract and normalize the gold answer from a solution string (last \\boxed{...})."""
     boxed = last_boxed_only_string(solution)
@@ -368,10 +407,12 @@ def extract_answers(
     all_answers: list[str] = []
 
     if use_cot:
-        if cot_style == "minerva":
-            minerva_answer = normalize_final_answer(get_unnormalized_answer(raw, relaxed=relaxed))
-            if minerva_answer and minerva_answer != INVALID_ANSWER:
-                all_answers.append(minerva_answer)
+        if cot_style not in COT_EXTRACTORS:
+            raise ValueError(f"Unknown cot_style {cot_style!r}; valid: {sorted(COT_EXTRACTORS)}")
+        extractor = COT_EXTRACTORS[cot_style]
+        minerva_answer = normalize_final_answer(extractor(raw, relaxed=relaxed))
+        if minerva_answer and minerva_answer != INVALID_ANSWER:
+            all_answers.append(minerva_answer)
         boxed = last_boxed_only_string(raw)
         if boxed is not None:
             try:
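
`extract_answers` now dispatches through the `COT_EXTRACTORS` registry instead of hard-coding the `"minerva"` branch, so adding a language is one function plus one dict entry. A hedged sketch, assuming the module context above (`re`, `INVALID_ANSWER`); the French extractor is hypothetical and simplified, and only the `(text, relaxed) -> str` signature matters:

    def get_unnormalized_answer_fr(text: str, relaxed: bool = False) -> str:
        # hypothetical French extractor; ignores `relaxed` for brevity
        match = re.search(r"R[ée]ponse finale\s*:\s*(.*)", text, re.DOTALL)
        return match.group(1).strip() if match else INVALID_ANSWER

    COT_EXTRACTORS["minerva_fr"] = get_unnormalized_answer_fr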
src/eval_framework/response_generator.py
@@ -114,8 +114,8 @@ class ResponseGenerator:
         try:
             raw_loglikelihoods = self.llm.logprobs(samples)
         except Exception as e:
-            if raise_errors():
-                raise e
+            if raise_errors() or self.config.fail_on_error:
+                raise
             logger.info(f"Error: {e.__class__.__name__} {e}")
             raw_loglikelihoods = [
                 RawLoglikelihood(
@@ -166,7 +166,8 @@ class ResponseGenerator:
                     self.llm,
                     stop_sequences=stop_sequences,
                     max_tokens=max_tokens,
-                )  # type: ignore[call-arg]
+                    fail_on_error=self.config.fail_on_error,
+                )
             case ResponseType.LOGLIKELIHOODS:
                 return self._generate_loglikelihoods
             case _:
src/eval_framework/result_processors/result_processor.py
@@ -36,9 +36,9 @@ class ResultsFileProcessor(ResultProcessor):
         return {}
 
     def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
-        responses_data = [response.model_dump(mode="json", serialize_as_any=True) for response in responses]
         with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
-            f.write_all(responses_data)
+            for response in responses:
+                f.write(response.model_dump(mode="json", serialize_as_any=True))
 
     def save_response(self, response: Completion | Loglikelihood) -> None:
         with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
@@ -72,9 +72,9 @@ class ResultsFileProcessor(ResultProcessor):
         return responses
 
     def save_metrics_results(self, results: list[Result]) -> None:
-        result_data = [x.model_dump(mode="json") for x in results]
         with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
-            f.write_all(result_data)
+            for result in results:
+                f.write(result.model_dump(mode="json"))
 
     def save_metrics_result(self, result: Result) -> None:
         with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
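
Both writers switch from `f.write_all(...)` over a pre-built list of serialized records to serializing one record per loop iteration, which caps peak memory at a single record. The same pattern in isolation (assumes only the `jsonlines` package):

    import jsonlines

    records = ({"id": i, "value": i * i} for i in range(3))  # any iterable of dicts
    with jsonlines.open("results.jsonl", "w") as f:
        for record in records:
            f.write(record)  # one record serialized at a time; the full list is never materialized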
src/eval_framework/tasks/base.py
@@ -352,12 +352,15 @@ class BaseTask[SubjectType](ABC):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
+        fail_on_error: bool = False,
     ) -> list[Completion]:
         """
         Generates completions for the sample.
         :param sample: sample to generate completions for
         :param stop_sequences: stop sequences to use in completion generation
         :param max_tokens: maximum tokens to use in completion generation
+        :param fail_on_error: if True, re-raise the original exception instead of capturing it
+            into a per-sample Error completion
         :return: completion
         """
         if stop_sequences is None:
@@ -367,8 +370,8 @@ class BaseTask[SubjectType](ABC):
         try:
             raw_completions = llm.generate(samples=samples, stop_sequences=stop_sequences, max_tokens=max_tokens)
         except Exception as e:
-            if raise_errors():
-                raise e
+            if raise_errors() or fail_on_error:
+                raise
             logger.info(f"Error: {e.__class__.__name__} {e}")
             raw_completions = [
                 RawCompletion(
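
Alongside the `fail_on_error` flag, both handlers (here and in response_generator.py) change `raise e` to a bare `raise`. A bare `raise` re-raises the active exception with its original traceback intact, while `raise e` appends the handler's own line to the traceback, adding a misleading extra frame. Self-contained illustration:

    def risky() -> None:
        raise RuntimeError("inference backend unreachable")

    try:
        risky()
    except Exception:
        # bare raise: the traceback still ends inside risky(), with no
        # extra frame appended for this handler line (as `raise e` would add)
        raise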
src/eval_framework/tasks/benchmarks/aidanbench.py
@@ -104,7 +104,12 @@ class AidanBenchOriginal(BaseTask[str]):
         return [Message(role=Role.USER, content=instruction_message)]
 
     def _generation_loop(
-        self, llm: "BaseLLM", stop_sequences: list[str] | None, max_tokens: int | None, initial_samples: list[Sample]
+        self,
+        llm: "BaseLLM",
+        stop_sequences: list[str] | None,
+        max_tokens: int | None,
+        initial_samples: list[Sample],
+        fail_on_error: bool = False,
     ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
         initial_messages = [s.messages for s in initial_samples]
         samples = [(s, False) for s in initial_samples]  # (sample, is_done)
@@ -118,6 +123,7 @@ class AidanBenchOriginal(BaseTask[str]):
                 [samples[i][0] for i in not_done_idx],
                 stop_sequences=stop_sequences,
                 max_tokens=max_tokens,
+                fail_on_error=fail_on_error,
             )
             new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
             new_errors = [c.error for c in new_completions]
@@ -164,11 +170,14 @@ class AidanBenchOriginal(BaseTask[str]):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
+        fail_on_error: bool = False,
     ) -> list[Completion]:
         assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
             "Each sample must have exactly one USER message."
        )
-        all_message_histories, errors = self._generation_loop(llm, stop_sequences, max_tokens, samples)
+        all_message_histories, errors = self._generation_loop(
+            llm, stop_sequences, max_tokens, samples, fail_on_error=fail_on_error
+        )
 
         completion_list = []
         for idx, sample in enumerate(samples):
src/eval_framework/tasks/eval_config.py
@@ -27,6 +27,7 @@ KEYS_UNRELATED_TO_RESULTS = {
     "save_intermediate_results",
     "save_logs",
     "delete_output_dir_after_upload",
+    "fail_on_error",
 }
 
 
@@ -59,6 +60,9 @@ class EvalConfig(BaseConfig):
     # how many times to repeat a single sample
     # can be used to reduce variance of tasks with low number of samples, e.g. AIME24
     repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
+    # When True, request/sample errors (e.g. unreachable inference endpoint, exhausted retries)
+    # propagate instead of being captured into a blank Error result.
+    fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
     # Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
 
     @property
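
The new `fail_on_error` field follows the same `BeforeValidator` convention as `repeats`: an explicit `None` (e.g. a null in a config file) is coerced to the default before type validation, so it behaves like "unset". A minimal, self-contained sketch of that convention; `MiniConfig` is an illustrative stand-in, not the real `EvalConfig`:

    from typing import Annotated
    from pydantic import BaseModel, BeforeValidator

    class MiniConfig(BaseModel):  # stand-in for EvalConfig
        fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False

    print(MiniConfig(fail_on_error=None).fail_on_error)  # False: null coerced to the default
    print(MiniConfig(fail_on_error=True).fail_on_error)  # True: propagate request errors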