eval-framework 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.0 → eval_framework-0.3.1}/PKG-INFO +1 -1
  2. {eval_framework-0.3.0 → eval_framework-0.3.1}/pyproject.toml +1 -1
  3. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/response_generator.py +24 -8
  4. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/suite.py +33 -31
  5. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/base.py +8 -7
  6. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +7 -1
  7. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +7 -1
  8. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +8 -0
  9. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/eval_config.py +2 -1
  10. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/task_names.py +3 -0
  11. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/task_style.py +49 -6
  12. {eval_framework-0.3.0 → eval_framework-0.3.1}/LICENSE +0 -0
  13. {eval_framework-0.3.0 → eval_framework-0.3.1}/README.md +0 -0
  14. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/__init__.py +0 -0
  15. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/base_config.py +0 -0
  16. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/context/__init__.py +0 -0
  17. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/context/determined.py +0 -0
  18. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/context/eval.py +0 -0
  19. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/context/local.py +0 -0
  20. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/evaluation_generator.py +0 -0
  21. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/exceptions.py +0 -0
  22. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/drop_process_results.py +0 -0
  23. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  24. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  25. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  26. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  27. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  28. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/__init__.py +0 -0
  29. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
  30. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/base.py +0 -0
  31. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/huggingface.py +0 -0
  32. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/mistral.py +0 -0
  33. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/models.py +0 -0
  34. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/openai.py +0 -0
  35. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/llm/vllm.py +0 -0
  36. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/logger.py +0 -0
  37. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/main.py +0 -0
  38. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/__init__.py +0 -0
  39. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  40. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  41. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/base.py +0 -0
  42. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
  43. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  44. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  45. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
  46. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
  47. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  48. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  49. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/comet.py +0 -0
  50. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  51. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  52. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  53. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  54. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  55. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/f1.py +0 -0
  56. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  57. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  58. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  59. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
  60. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  61. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
  62. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  63. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  64. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  65. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  66. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  67. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  68. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
  69. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  70. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  71. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  72. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  73. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  74. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ter.py +0 -0
  75. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  76. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  77. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  78. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/__init__.py +0 -0
  79. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/base.py +0 -0
  80. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  81. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  82. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  83. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  84. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  85. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  86. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  87. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  88. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  89. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  90. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  91. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  92. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  93. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  94. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  95. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  96. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  97. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  98. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  99. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  100. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  101. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  102. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  103. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  104. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  105. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/utils.py +0 -0
  106. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  107. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  108. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  109. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  110. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  111. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  112. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  113. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  114. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/py.typed +0 -0
  115. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/result_processors/__init__.py +0 -0
  116. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/result_processors/base.py +0 -0
  117. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  118. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/result_processors/result_processor.py +0 -0
  119. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  120. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/run.py +0 -0
  121. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/run_direct.py +0 -0
  122. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/shared/types.py +0 -0
  123. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  124. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/__init__.py +0 -0
  125. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  126. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  127. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  128. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  129. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  130. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  131. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  132. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  133. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  134. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  135. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  136. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  137. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  138. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  139. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  140. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  141. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  142. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  143. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  144. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  145. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  146. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  147. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  148. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  149. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  150. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  151. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  152. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  153. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  154. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  155. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  156. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  157. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  158. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  159. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  160. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  161. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  162. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  163. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  164. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  165. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  166. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  167. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  168. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  169. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  170. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  171. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  172. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  173. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  174. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  175. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  176. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/perturbation.py +0 -0
  177. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/registry.py +0 -0
  178. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/task_loader.py +0 -0
  179. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.0 → eval_framework-0.3.1}/src/template_formatting/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.0
+Version: 0.3.1
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.3.0"
+version = "0.3.1"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
src/eval_framework/response_generator.py
@@ -67,14 +67,18 @@ class ResponseGenerator:
         if config.perturbation_config is not None:
             perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
             self.task = perturbation_task_class.with_overwrite(
-                self.few_shot, custom_subjects=self.config.task_subjects, custom_hf_revision=self.config.hf_revision
+                self.few_shot,
+                custom_subjects=self.config.task_subjects,
+                custom_hf_revision=self.config.hf_revision,
             )
         else:
             self.task = task_class.with_overwrite(
-                self.few_shot, custom_subjects=self.config.task_subjects, custom_hf_revision=self.config.hf_revision
+                self.few_shot,
+                custom_subjects=self.config.task_subjects,
+                custom_hf_revision=self.config.hf_revision,
             )

-        self.response_type = task_class.RESPONSE_TYPE
+        self.response_type, _ = self.task._get_type_and_metrics()

     def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
         """
@@ -89,7 +93,10 @@ class ResponseGenerator:
         task_stop_sequences = getattr(self.task, "stop_sequences", None)
         task_max_tokens = self.config.max_tokens or getattr(self.task, "max_tokens", None)
         # if both task and model define a max_token, the smaller value is used
-        max_tokens = min([x for x in [llm_max_tokens, task_max_tokens] if x is not None], default=None)
+        max_tokens = min(
+            [x for x in [llm_max_tokens, task_max_tokens] if x is not None],
+            default=None,
+        )
         logger.info(f"Set max_tokens to {max_tokens}")
         # if both task and model define stop sequences, those are merged into one list
         stop_sequences_merged = (llm_stop_sequences or []) + (task_stop_sequences or [])
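
To illustrate the precedence rule in the comment above (a minimal standalone sketch; resolve_max_tokens is a hypothetical helper, not framework code):

def resolve_max_tokens(llm_max_tokens: int | None, task_max_tokens: int | None) -> int | None:
    # The smaller of the two caps wins; None means "no cap defined".
    return min([x for x in [llm_max_tokens, task_max_tokens] if x is not None], default=None)

assert resolve_max_tokens(2048, 256) == 256    # both set: the stricter cap applies
assert resolve_max_tokens(None, 256) == 256    # only the task sets a cap
assert resolve_max_tokens(None, None) is None  # neither sets a cap

# Stop sequences, by contrast, are merged rather than minimized:
assert (["</s>"] or []) + (None or []) == ["</s>"]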
@@ -117,7 +124,9 @@ class ResponseGenerator:
                 loglikelihoods={},
                 loglikelihoods_sequence_positions={},
                 raw_loglikelihood_error=Error(
-                    error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()
+                    error_class=e.__class__.__name__,
+                    message=str(e),
+                    traceback=traceback.format_exc(),
                 ),
             )
             for _ in range(len(samples))
@@ -142,7 +151,9 @@ class ResponseGenerator:
             )
         return loglikelihood_list

-    def _generative_output_type_selector(self) -> Callable[[list[Sample]], list[Completion] | list[Loglikelihood]]:
+    def _generative_output_type_selector(
+        self,
+    ) -> Callable[[list[Sample]], list[Completion] | list[Loglikelihood]]:
         """
         Selects the generative output type based on the response type.
         :return: function to generate responses
@@ -151,7 +162,10 @@ class ResponseGenerator:
             case ResponseType.COMPLETION:
                 stop_sequences, max_tokens = self._llm_task_param_precedence()
                 return partial(
-                    self.task.generate_completions, self.llm, stop_sequences=stop_sequences, max_tokens=max_tokens
+                    self.task.generate_completions,
+                    self.llm,
+                    stop_sequences=stop_sequences,
+                    max_tokens=max_tokens,
                 )  # type: ignore[call-arg]
             case ResponseType.LOGLIKELIHOODS:
                 return self._generate_loglikelihoods
@@ -245,7 +259,9 @@ class ResponseGenerator:

         samples_batch: list[Sample] = []
         with tqdm(
-            total=total_num_samples, desc=f"Processing {self.response_type.value}", disable=get_disable_bar_flag()
+            total=total_num_samples,
+            desc=f"Processing {self.response_type.value}",
+            disable=get_disable_bar_flag(),
         ) as pbar:
             samples = self.task.iterate_samples(self.num_samples)
             for i, sample in enumerate(repeat_samples(samples, repeats)):
src/eval_framework/suite.py
@@ -46,7 +46,17 @@ def parse_strings_to_task_or_suite(v: str | list) -> str | list:
     return [{"tasks": item, "name": item} if isinstance(item, str) else item for item in v]


-_VALID_METHODS = {"mean", "median", "passthrough"}
+_VALID_METHODS = {"mean", "median"}
+
+
+class MetricSource(BaseModel):
+    """A single (child, metric) pair used as an input to a SuiteAggregate. See the examples folder
+    for how these are used."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    child: str
+    metric: str


 class SuiteAggregate(BaseModel):
@@ -55,7 +65,7 @@ class SuiteAggregate(BaseModel):
     model_config = ConfigDict(extra="forbid")

     name: str
-    metric: Annotated[list[str], BeforeValidator(lambda v: [v] if isinstance(v, str) else v)]
+    sources: list[MetricSource]
     method: str | Callable[[list[float]], float] = "mean"

     @field_validator("method")
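
Under the new schema an aggregate names each (child, metric) pair explicitly instead of listing bare metric names. A sketch using reduced copies of the two models above (the aggregate name, child names, and metric keys are made up; the real SuiteAggregate also validates method):

from typing import Callable

from pydantic import BaseModel, ConfigDict

class MetricSource(BaseModel):
    model_config = ConfigDict(extra="forbid")
    child: str
    metric: str

class SuiteAggregate(BaseModel):
    model_config = ConfigDict(extra="forbid")
    name: str
    sources: list[MetricSource]
    method: str | Callable[[list[float]], float] = "mean"

agg = SuiteAggregate(
    name="reasoning_mean",
    sources=[
        MetricSource(child="arc", metric="Accuracy"),
        MetricSource(child="hellaswag", metric="Accuracy"),
    ],
)
assert agg.method == "mean"  # default reduction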
@@ -197,52 +207,44 @@ def compute_aggregates(
     aggregates: list[SuiteAggregate],
     child_results: dict[str, SuiteResult],
 ) -> dict[str, float | None]:
-    """Compute suite-level stats from children's results.
+    """Compute suite-level stats from explicitly named (child, metric) sources.

-    Each SuiteAggregate either reduces a metric across all children
-    (method="mean", "median", or a callable) or surfaces a metric
-    from exactly one child as-is (method="passthrough").
+    For each `SuiteAggregate`, the value from each `MetricSource` is looked up by
+    child name and exact metric key. Sources whose child is missing or whose metric is
+    None or NaN are silently skipped. If no sources yield a valid value the aggregate is None.
     """
     result: dict[str, float | None] = {}

     for agg in aggregates:
-        if agg.method == "passthrough":
-            child = child_results.get(agg.name)
+        values: list[float] = []
+        for source in agg.sources:
+            child = child_results.get(source.child)
             if child is None:
-                raise ValueError(
-                    f"SuiteAggregate '{agg.name}' uses method='passthrough' but no child "
-                    f"named '{agg.name}' exists. Available children: {list(child_results.keys())}."
+                logger.warning(
+                    f"SuiteAggregate '{agg.name}' uses source '{source.child}' which is not a child of the suite. "
+                    f"Available children: {list(child_results.keys())}."
                 )
-            # don't count NaN and None values.
-            result[agg.name] = next(
-                (v for m in agg.metric if (v := child.aggregates.get(m)) is not None and not math.isnan(v)),
-                None,
-            )
-        else:
-            values: dict[str, float] = {}
-            for child_name, child in child_results.items():
-                for m in agg.metric:
-                    val = child.aggregates.get(m)
-                    if val is not None and not math.isnan(val):
-                        values[child_name] = val
-                        break
-            result[agg.name] = _apply_method(agg.method, values) if values else None
+                continue
+            val = child.aggregates.get(source.metric)
+            if val is not None and not math.isnan(val):
+                values.append(val)
+            else:
+                logger.warning(f"The value for source '{source.child}' with metric '{source.metric}' is None or NaN.")
+        result[agg.name] = _apply_method(agg.method, values) if values else None

     return result


 def _apply_method(
     method: str | Callable[[list[float]], float],
-    values: dict[str, float],
+    values: list[float],
 ) -> float:
-    vals = list(values.values())
-
     if callable(method):
-        return method(vals)
+        return method(values)
     elif method == "mean":
-        return float(np.mean(vals))
+        return float(np.mean(values))
     elif method == "median":
-        return float(np.median(vals))
+        return float(np.median(values))
     else:
         raise ValueError(f"Unknown aggregation method: '{method}'. Use mean or median.")

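The skip semantics described in the new docstring, shown on stand-in data (a sketch: SimpleNamespace stands in for SuiteResult, and all child and metric names are invented):

import math
from types import SimpleNamespace

child_results = {
    "arc": SimpleNamespace(aggregates={"Accuracy": 0.75}),
    "hellaswag": SimpleNamespace(aggregates={"Accuracy": 0.25, "BitsPerByte": float("nan")}),
}
sources = [
    ("arc", "Accuracy"),            # valid -> counted
    ("hellaswag", "Accuracy"),      # valid -> counted
    ("hellaswag", "BitsPerByte"),   # NaN -> warned about, skipped
    ("no_such_child", "Accuracy"),  # unknown child -> warned about, skipped
]

values: list[float] = []
for child_name, metric in sources:
    child = child_results.get(child_name)
    if child is None:
        continue
    val = child.aggregates.get(metric)
    if val is not None and not math.isnan(val):
        values.append(val)

assert values == [0.75, 0.25]
assert sum(values) / len(values) == 0.5  # the "mean" aggregate over valid sources
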
src/eval_framework/tasks/base.py
@@ -34,6 +34,7 @@ class ResponseType(Enum):
 class TaskStyle(Enum):
     MULTIPLE_CHOICE = "multiple_choice"
     CLOZE = "cloze"
+    BPB = "bpb"


 class Language(Enum):
@@ -311,7 +312,7 @@ class BaseTask[SubjectType](ABC):

     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
         if hasattr(self, "TASK_STYLER"):
-            return self.TASK_STYLER.get_possible_completions(self._get_choices(item))
+            return self.TASK_STYLER.get_possible_completions(self._get_choices(item), self._get_correct_index(item))
         return None

     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
@@ -331,12 +332,7 @@ class BaseTask[SubjectType](ABC):
         return None

     def get_metadata(self) -> dict[str, str | list[str]]:
-        if hasattr(self, "TASK_STYLER"):
-            response_type = self.TASK_STYLER.response_type
-            metrics = self.TASK_STYLER.metrics
-        else:
-            response_type = self.RESPONSE_TYPE
-            metrics = self.METRICS
+        response_type, metrics = self._get_type_and_metrics()

         meta: dict[str, str | list[str]] = {
             "dataset_path": self.DATASET_PATH,
@@ -423,3 +419,8 @@ class BaseTask[SubjectType](ABC):
             )
         )
         return completion_list
+
+    def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]:
+        if hasattr(self, "TASK_STYLER"):
+            return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
+        return self.RESPONSE_TYPE, self.METRICS
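
The resolution order: a TASK_STYLER, when present, wins over the class-level RESPONSE_TYPE and METRICS. A self-contained sketch with stub classes (none of these stubs are framework code; strings stand in for the real enum and metric types):

class _Styler:
    response_type = "loglikelihoods"
    metrics = ["BitsPerByteLoglikelihood"]

class _PlainTask:
    RESPONSE_TYPE = "completion"
    METRICS = ["AccuracyCompletion"]

    def _get_type_and_metrics(self):
        # Same shape as the method added above: styler first, class attributes as fallback.
        if hasattr(self, "TASK_STYLER"):
            return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
        return self.RESPONSE_TYPE, self.METRICS

class _StyledTask(_PlainTask):
    TASK_STYLER = _Styler()

assert _PlainTask()._get_type_and_metrics() == ("completion", ["AccuracyCompletion"])
assert _StyledTask()._get_type_and_metrics() == ("loglikelihoods", ["BitsPerByteLoglikelihood"])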
src/eval_framework/tasks/benchmarks/global_mmlu.py
@@ -477,7 +477,7 @@ class GlobalMMLU(BaseTask[tuple[str, str]]):
     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
     SUBJECTS = list(product(GLOBAL_MMLU_LANGUAGES, MMLU_SUBJECTS))
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
-    LANGUAGE = {
+    LANGUAGE: Language | dict[str, Language] | None = {
         str((lang_code.split("_")[0], subject)): LANGUAGE_NAME_MAP[lang_code]
         for lang_code, subjects in LANGUAGE_SUBJECTS_MAP.items()
         for subject in subjects
@@ -531,3 +531,9 @@ class GlobalMMLU(BaseTask[tuple[str, str]]):

     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
         return [f" {key}" for key in self.keys]
+
+
+class GlobalMMLU_German(GlobalMMLU):
+    NAME = "GlobalMMLU_German"
+    SUBJECTS = [("de", subject) for subject in MMLU_SUBJECTS]
+    LANGUAGE = Language.DEU
src/eval_framework/tasks/benchmarks/mmmlu.py
@@ -426,7 +426,7 @@ class MMMLU(BaseTask[tuple[str, str]]):
     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
     SUBJECTS = list(product(MMMLU_LANGS, MMLU_SUBJECTS))
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
-    LANGUAGE = {
+    LANGUAGE: Language | dict[str, Language] | None = {
         str((lang_code.split("_")[0], subject)): LANGUAGE_NAME_MAP[lang_code]
         for lang_code, subjects in LANGUAGE_SUBJECTS_MAP.items()
         for subject in subjects
@@ -480,6 +480,12 @@ class MMMLU(BaseTask[tuple[str, str]]):
         return [f" {key}" for key in self.keys]


+class MMMLU_German(MMMLU):
+    NAME = "MMMLU_German"
+    SUBJECTS = [("DE_DE", subject) for subject in MMLU_SUBJECTS]
+    LANGUAGE = Language.DEU
+
+
 class MMMLU_GERMAN_COT(MMMLU):
     NAME = "MMMLU_GERMAN_COT"
     RESPONSE_TYPE = ResponseType.COMPLETION
src/eval_framework/tasks/benchmarks/naturalqs_open.py
@@ -3,6 +3,7 @@ from typing import Any
 from eval_framework.metrics.completion.drop_completion import DropF1ExactMatch, DropMetricContext
 from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
 from eval_framework.tasks.task_style import (
+    BPBStyle,
     ClozeStyle,
     MCStyle,
     answer_key_to_index,
@@ -93,3 +94,10 @@ class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):

     NAME = "NaturalQsOpenMC_OLMES"
     TASK_STYLER = MCStyle(space_prefixed_labels=True)
+
+
+class NaturalQsOpenBPB(_NaturalQsOpenChoice_Base):
+    """BPB-only variant."""
+
+    NAME = "NaturalQsOpenBPB"
+    TASK_STYLER = BPBStyle()
src/eval_framework/tasks/eval_config.py
@@ -112,7 +112,8 @@ class EvalConfig(BaseConfig):
     @model_validator(mode="after")
     def validate_llm_judge_defined(self) -> "EvalConfig":
         task = get_task(self.task_name)
-        for metric_class in task.METRICS:
+        _, task_metrics = task(num_fewshot=0)._get_type_and_metrics()
+        for metric_class in task_metrics:
             if issubclass(metric_class, BaseLLMJudgeMetric):
                 assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
         return self
src/eval_framework/tasks/task_names.py
@@ -119,8 +119,10 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu_pro.MMLU_PRO_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU")
+    register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU_German")
     register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU_GERMAN_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.global_mmlu.GlobalMMLU")
+    register_lazy_task("eval_framework.tasks.benchmarks.global_mmlu.GlobalMMLU_German")
     register_lazy_task("eval_framework.tasks.benchmarks.pawsx.PAWSX")
     register_lazy_task("eval_framework.tasks.benchmarks.piqa.PIQA")
     register_lazy_task("eval_framework.tasks.benchmarks.piqa.PIQA_IDK")
@@ -190,6 +192,7 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenCloze")
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC")
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC_OLMES")
+    register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenBPB")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQACloze")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC_OLMES")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC")
src/eval_framework/tasks/task_style.py
@@ -52,6 +52,10 @@ dataset attributes and data-access methods. Variants only differ in ``TASK_STYLER``.
     class ARC_MC(_ARC_Base):
         NAME = "ARC_MC"
         TASK_STYLER = MCStyle(space_prefixed_labels=True)
+
+    class ARC_BPB(_ARC_Base):
+        NAME = "ARC_BPB"
+        TASK_STYLER = BPBStyle()
 """

 import hashlib
@@ -111,8 +115,13 @@ class TaskStyler(ABC):
         """Return the ground-truth string for scoring."""

     @abstractmethod
-    def get_possible_completions(self, choices: list[str]) -> list[str]:
-        """Return the list of scored completion strings."""
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        """Return the list of completion strings to be evaluated.
+
+        ``correct_index`` is only required by ``BPBStyle``, which scores solely the
+        ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and
+        ignore it; callers may omit it when using those stylers.
+        """

     @abstractmethod
     def get_cue_text(self) -> str:
@@ -196,7 +205,8 @@ class MCStyle(TaskStyler):
         labels = get_n_letters(len(choices))
         return f" {labels[correct_index]}"

-    def get_possible_completions(self, choices: list[str]) -> list[str]:
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        """Note: `correct_index` is ignored for `MCStyle` and only used for `BPBStyle`."""
         return [f" {label}" for label in get_n_letters(len(choices))]


@@ -241,10 +251,12 @@ class ClozeStyle(TaskStyler):
         question_prefix: str = "Question: ",
         cue_text: str = "Answer:",
         trailing_newline: bool = True,
+        leading_space_continuations: bool = True,
     ) -> None:
         self.question_prefix = question_prefix
         self._cue_text = cue_text
         self.trailing_newline = trailing_newline
+        self.leading_space_continuations = leading_space_continuations

     def get_cue_text(self) -> str:
         return self._cue_text
@@ -254,10 +266,41 @@ class ClozeStyle(TaskStyler):
         return f"{text}\n" if self.trailing_newline else text

     def get_ground_truth(self, choices: list[str], correct_index: int) -> str:
-        return f" {choices[correct_index]}"
+        return f" {choices[correct_index]}" if self.leading_space_continuations else choices[correct_index]
+
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        return [f" {c}" for c in choices] if self.leading_space_continuations else [f"{c}" for c in choices]
+
+
+class BPBStyle(ClozeStyle):
+    """BPB-only styler: prompt identical to ClozeStyle, but scores only the ground-truth completion.
+
+    One LLM forward pass per sample instead of N (one per choice), making evaluation
+    significantly faster when accuracy metrics are not needed.
+
+    Args:
+        question_prefix: Prepended to the raw question (default ``"Question: "``).
+        cue_text: Assistant cue after the prompt (default ``"Answer:"``).
+        trailing_newline: When ``True`` (default), the instruction ends with ``"\\n"``.
+
+    Assembled prompt example (3 choices)::
+
+        "Question: What is the capital of France?\\n"
+
+        Scored completions: [" Paris"]  ← ground truth only, one forward pass
+        Ground truth: " Paris"
+    """
+
+    metrics: list[type["BaseMetric"]] = [BitsPerByteLoglikelihood]
+    task_style = TaskStyle.BPB

-    def get_possible_completions(self, choices: list[str]) -> list[str]:
-        return [f" {c}" for c in choices]
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        if correct_index is None:
+            raise ValueError(
+                "BPBStyle evaluates the loglikelihood of the ground truth answer only,"
+                "and thus requires the correct index."
+            )
+        return [f" {choices[correct_index]}"] if self.leading_space_continuations else [choices[correct_index]]


 # ---------------------------------------------------------------------------
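
The practical difference between the stylers, reduced to the sets of scored strings (a sketch; prompt assembly, metrics, and the real classes are omitted):

choices = ["Paris", "London", "Berlin"]
correct_index = 0

# ClozeStyle with leading_space_continuations=True: every choice is scored -> N forward passes.
cloze_completions = [f" {c}" for c in choices]
assert cloze_completions == [" Paris", " London", " Berlin"]

# BPBStyle: only the ground truth is scored -> one forward pass per sample.
bpb_completions = [f" {choices[correct_index]}"]
assert bpb_completions == [" Paris"]

# With leading_space_continuations=False, both stylers drop the leading space.
assert [f"{c}" for c in choices][correct_index] == "Paris"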