eval-framework 0.2.13__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. {eval_framework-0.2.13 → eval_framework-0.2.14}/PKG-INFO +1 -1
  2. {eval_framework-0.2.13 → eval_framework-0.2.14}/pyproject.toml +1 -1
  3. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/aleph_alpha.py +7 -2
  4. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/base.py +5 -2
  5. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/huggingface.py +3 -1
  6. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/openai.py +15 -0
  7. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/vllm.py +10 -1
  8. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/math_reasoning.py +19 -0
  9. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mbpp.py +101 -6
  10. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/task_names.py +2 -0
  11. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/generate_task_docs.py +8 -4
  12. {eval_framework-0.2.13 → eval_framework-0.2.14}/LICENSE +0 -0
  13. {eval_framework-0.2.13 → eval_framework-0.2.14}/README.md +0 -0
  14. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/__init__.py +0 -0
  15. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/base_config.py +0 -0
  16. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/__init__.py +0 -0
  17. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/determined.py +0 -0
  18. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/eval.py +0 -0
  19. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/local.py +0 -0
  20. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/evaluation_generator.py +0 -0
  21. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/exceptions.py +0 -0
  22. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/drop_process_results.py +0 -0
  23. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  24. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  25. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  26. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  27. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  28. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/__init__.py +0 -0
  29. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/mistral.py +0 -0
  30. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/models.py +0 -0
  31. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/logger.py +0 -0
  32. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/main.py +0 -0
  33. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/__init__.py +0 -0
  34. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/base.py +0 -0
  35. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/__init__.py +0 -0
  36. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  37. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  38. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/bleu.py +0 -0
  39. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/chrf.py +0 -0
  40. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  41. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  42. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/comet.py +0 -0
  43. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  44. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  45. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  46. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  47. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  48. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/f1.py +0 -0
  49. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  50. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  51. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  52. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/json_format.py +0 -0
  53. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  54. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/length_control.py +0 -0
  55. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  56. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  57. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  58. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/base.py +0 -0
  72. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  73. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  74. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  75. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  76. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  77. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  78. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  79. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  80. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  81. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  82. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  83. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  84. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  85. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  86. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  87. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  88. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  89. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  90. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  91. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  92. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  93. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  94. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  95. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  96. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  97. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/utils.py +0 -0
  98. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  99. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  100. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  101. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  102. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  103. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  104. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  105. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  106. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/py.typed +0 -0
  107. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/response_generator.py +0 -0
  108. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/__init__.py +0 -0
  109. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/base.py +0 -0
  110. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  111. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/result_processor.py +0 -0
  112. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  113. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/run.py +0 -0
  114. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/__init__.py +0 -0
  117. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/base.py +0 -0
  118. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  119. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  120. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  121. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  122. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  123. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  124. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  125. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  126. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  127. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  128. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  129. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  130. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  131. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  132. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  133. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  134. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  135. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  136. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  137. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  138. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  139. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  140. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  141. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  142. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  143. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  144. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  145. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  146. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  147. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  148. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  149. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  150. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  151. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  152. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  153. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  154. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  155. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  156. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  157. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  158. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  159. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  160. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  161. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  162. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  163. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  164. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  165. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  166. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  167. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  168. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  169. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/eval_config.py +0 -0
  170. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/perturbation.py +0 -0
  171. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/registry.py +0 -0
  172. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/task_loader.py +0 -0
  173. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/utils.py +0 -0
  174. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/constants.py +0 -0
  175. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/file_ops.py +0 -0
  176. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/helpers.py +0 -0
  177. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/logging.py +0 -0
  178. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/packaging.py +0 -0
  179. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/tqdm_handler.py +0 -0
  180. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/README.md +0 -0
  181. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/__init__.py +0 -0
  182. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/formatter.py +0 -0
  183. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/mistral_formatter.py +0 -0
  184. {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.13
3
+ Version: 0.2.14
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.13"
3
+ version = "0.2.14"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -200,11 +200,16 @@ class AlephAlphaAPIModel(BaseLLM):
200
200
  stop_sequences: list[str] | None = None,
201
201
  max_tokens: int | None = None,
202
202
  temperature: float | None = None,
203
+ top_p: float | None = None,
203
204
  ) -> list[RawCompletion]:
204
205
  effective_temperature = temperature if temperature is not None else self._temperature
206
+ if effective_temperature is not None and not (0 <= effective_temperature <= 2):
207
+ raise ValueError(f"temperature must be between 0 and 2, got {effective_temperature}")
208
+ effective_top_p = top_p if top_p is not None else self._top_p
209
+ if effective_top_p is not None and not (0 < effective_top_p <= 1):
210
+ raise ValueError(f"top_p must be between 0 and 1 (exclusive), got {effective_top_p}")
205
211
 
206
212
  requests: list[CompletionRequest] = []
207
-
208
213
  # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
209
214
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
210
215
 
@@ -215,7 +220,7 @@ class AlephAlphaAPIModel(BaseLLM):
215
220
  maximum_tokens=scaled_max_tokens,
216
221
  stop_sequences=stop_sequences,
217
222
  temperature=effective_temperature,
218
- top_p=self._top_p,
223
+ top_p=effective_top_p,
219
224
  )
220
225
  )
221
226
 
@@ -24,6 +24,7 @@ class BaseLLM(ABC):
24
24
  stop_sequences: list[str] | None = None,
25
25
  max_tokens: int | None = None,
26
26
  temperature: float | None = None,
27
+ top_p: float | None = None,
27
28
  ) -> list[RawCompletion]:
28
29
  """
29
30
  stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
@@ -47,6 +48,7 @@ class BaseLLM(ABC):
47
48
  stop_sequences: list[str] | None = None,
48
49
  max_tokens: int | None = None,
49
50
  temperature: float | None = None,
51
+ top_p: float | None = None,
50
52
  ) -> list[RawCompletion]:
51
53
  """
52
54
  stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
@@ -79,6 +81,7 @@ class BaseLLM(ABC):
79
81
  stop_sequences: list[str] | None = None,
80
82
  max_tokens: int | None = None,
81
83
  temperature: float | None = None,
84
+ top_p: float | None = None,
82
85
  ) -> list[RawCompletion]:
83
86
  """Generates a model response for each sample.
84
87
 
@@ -86,10 +89,10 @@ class BaseLLM(ABC):
86
89
  otherwise falls back to 'generate_from_messages'.
87
90
  """
88
91
  try:
89
- return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
92
+ return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature, top_p)
90
93
  except NotImplementedError:
91
94
  messages: list[Sequence[Message]] = [sample.messages for sample in samples]
92
- return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
95
+ return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
93
96
 
94
97
  def post_process_completion(self, completion: str, sample: Sample) -> str:
95
98
  """
@@ -146,6 +146,7 @@ class BaseHFLLM(BaseLLM):
146
146
  stop_sequences: list[str] | None = None,
147
147
  max_tokens: int | None = None,
148
148
  temperature: float | None = None,
149
+ top_p: float | None = None,
149
150
  ) -> list[RawCompletion]:
150
151
  if temperature is None:
151
152
  effective_temperature = 0.0 # Current default, TODO: refactor to use model's default
@@ -154,7 +155,8 @@ class BaseHFLLM(BaseLLM):
154
155
  )
155
156
  else:
156
157
  effective_temperature = temperature
157
-
158
+ if top_p is not None:
159
+ logger.warning("Huggingface LLM does not support top_p. Ignoring top_p value.")
158
160
  raw_completions = []
159
161
  for single_messages in messages:
160
162
  # format
@@ -34,6 +34,7 @@ class OpenAIModel(BaseLLM):
34
34
  model_name: str | None = None,
35
35
  formatter: BaseFormatter | None = None,
36
36
  temperature: float | None = None,
37
+ top_p: float | None = None,
37
38
  api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
38
39
  organization: str | None = None,
39
40
  base_url: str | None = None,
@@ -46,6 +47,7 @@ class OpenAIModel(BaseLLM):
46
47
  model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
47
48
  formatter: Optional message formatter.
48
49
  temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
50
+ top_p: Nucleus sampling probability mass (from 0.0 to 1.0). If None, the API default is used.
49
51
  api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
50
52
  organization: Optional OpenAI organization ID.
51
53
  base_url: Optional API base URL for Azure or alternate endpoints.
@@ -59,6 +61,10 @@ class OpenAIModel(BaseLLM):
59
61
  self._temperature = temperature if temperature is not None else 0.0
60
62
  assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
61
63
 
64
+ if top_p is not None:
65
+ assert 0.0 <= top_p <= 1.0, "top_p must be between 0.0 and 1.0"
66
+ self._top_p = top_p
67
+
62
68
  self._client = OpenAI(
63
69
  api_key=api_key,
64
70
  organization=organization,
@@ -97,6 +103,7 @@ class OpenAIModel(BaseLLM):
97
103
  stop_sequences: list[str] | None = None,
98
104
  max_tokens: int | None = None,
99
105
  temperature: float | None = None,
106
+ top_p: float | None = None,
100
107
  ) -> list[RawCompletion]:
101
108
  """
102
109
  Generate completions for a list of message sequences concurrently.
@@ -108,6 +115,7 @@ class OpenAIModel(BaseLLM):
108
115
  stop_sequences: Optional list of stop sequences.
109
116
  max_tokens: Optional maximum number of tokens to generate.
110
117
  temperature: Sampling temperature.
118
+ top_p: Nucleus sampling probability mass (0.0 to 1.0). Overrides instance default if provided.
111
119
 
112
120
  Returns:
113
121
  List of RawCompletion objects containing prompts and completions.
@@ -116,6 +124,10 @@ class OpenAIModel(BaseLLM):
116
124
  effective_temperature = temperature if temperature is not None else self._temperature
117
125
  assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
118
126
 
127
+ effective_top_p = top_p if top_p is not None else self._top_p
128
+ if effective_top_p is not None:
129
+ assert 0.0 <= effective_top_p <= 1.0, "top_p must be between 0.0 and 1.0"
130
+
119
131
  def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
120
132
  # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
121
133
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
@@ -129,6 +141,7 @@ class OpenAIModel(BaseLLM):
129
141
  model=self._model_name,
130
142
  prompt=prompt,
131
143
  temperature=effective_temperature,
144
+ top_p=effective_top_p,
132
145
  max_tokens=scaled_max_tokens,
133
146
  stop=stop_sequences,
134
147
  )
@@ -158,6 +171,7 @@ class OpenAIModel(BaseLLM):
158
171
  model=self._model_name,
159
172
  messages=chat_messages,
160
173
  temperature=effective_temperature,
174
+ top_p=effective_top_p,
161
175
  max_tokens=scaled_max_tokens,
162
176
  stop=stop_sequences,
163
177
  )
@@ -300,6 +314,7 @@ class OpenAIEmbeddingModel(BaseLLM):
300
314
  stop_sequences: list[str] | None = None,
301
315
  max_tokens: int | None = None,
302
316
  temperature: float | None = None,
317
+ top_p: float | None = None,
303
318
  ) -> list[RawCompletion]:
304
319
  raise NotImplementedError(
305
320
  "Embedding model does not support generate_from_messages. Use generate_embeddings instead."
@@ -226,6 +226,7 @@ class BaseVLLMModel(BaseLLM):
226
226
  stop_sequences: list[str] | None = None,
227
227
  max_tokens: int | None = None,
228
228
  temperature: float | None = None,
229
+ top_p: float | None = None,
229
230
  ) -> list[RawCompletion]:
230
231
  raw_completions: list[RawCompletion | None] = [None] * len(messages)
231
232
  prompt_objs = []
@@ -235,7 +236,7 @@ class BaseVLLMModel(BaseLLM):
235
236
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
236
237
 
237
238
  sampling_params = self._resolve_sampling_params(
238
- self.sampling_params, scaled_max_tokens, stop_sequences, temperature
239
+ self.sampling_params, scaled_max_tokens, stop_sequences, temperature, top_p
239
240
  )
240
241
 
241
242
  for i, single_messages in enumerate(messages):
@@ -295,6 +296,7 @@ class BaseVLLMModel(BaseLLM):
295
296
  max_tokens: int | None,
296
297
  stop_sequences: list[str] | None,
297
298
  temperature: float | None,
299
+ top_p: float | None = None,
298
300
  ) -> SamplingParams:
299
301
  sampling_params.max_tokens = max_tokens
300
302
  sampling_params.stop = stop_sequences
@@ -308,6 +310,13 @@ class BaseVLLMModel(BaseLLM):
308
310
  f"Using sampling params temperature value: {sampling_params.temperature} "
309
311
  f"as no custom temperature value was provided"
310
312
  )
313
+ if top_p is not None:
314
+ logger.warning(f"Overriding sampling params top_p {sampling_params.top_p} with custom value {top_p}")
315
+ sampling_params.top_p = top_p
316
+ else:
317
+ logger.info(
318
+ f"Using sampling params top_p value: {sampling_params.top_p} as no custom top_p value was provided"
319
+ )
311
320
  return sampling_params
312
321
 
313
322
  def _model_generate(
@@ -394,6 +394,25 @@ class AIME2025(AIME2024):
394
394
  return item["answer"]
395
395
 
396
396
 
397
+ class AIME2026(AIME2024):
398
+ """AIME 2026 dataset: https://huggingface.co/datasets/math-ai/aime26
399
+
400
+ This dataset contains a single test split of 30 questions.
401
+ Data contains
402
+ problem | answer | id
403
+
404
+ pass@1 evaluation
405
+ """
406
+
407
+ NAME = "AIME2026"
408
+ DATASET_PATH = "math-ai/aime26"
409
+ SAMPLE_SPLIT = "test"
410
+ FEWSHOT_SPLIT = "test"
411
+
412
+ def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
413
+ return item["answer"]
414
+
415
+
397
416
  class MATH500(MATHReasoning):
398
417
  """MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
399
418
 
@@ -105,9 +105,7 @@ class MBPP(BaseTask[str]):
105
105
  def _get_context(self, item: dict[str, Any]) -> MBPPMetricContext:
106
106
  return MBPPMetricContext(tests_code="\n".join(item["test_list"]))
107
107
 
108
- def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
109
- assert sample is not None
110
-
108
+ def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str: # type: ignore[override]
111
109
  if BEGIN in completion_text:
112
110
  completion_text = completion_text.split(f"{BEGIN}\n")[1]
113
111
 
@@ -193,9 +191,7 @@ class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
193
191
  return postfix
194
192
  return f"{postfix.strip()}:"
195
193
 
196
- def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
197
- assert sample is not None
198
-
194
+ def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str: # type: ignore[override]
199
195
  if BEGIN in completion_text:
200
196
  completion_text = completion_text.split(BEGIN)[1]
201
197
 
@@ -212,3 +208,102 @@ class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
212
208
  class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
      """``MBPP_PROMPT_WITHOUT_TESTS`` variant whose subject list is ``["sanitized"]``."""

      NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
      SUBJECTS = ["sanitized"]
211
+
212
+
213
# Three hardcoded few-shot examples, returned (as a shallow list copy) by
# MBPP_OLMES._sample_fewshot_examples. Each entry carries the task description
# ("text"), a reference solution ("code") and its assertions ("test_list").
# NOTE(review): every nested line inside the "code" strings below carries a
# single leading space, so e.g. the body of the `if` in the second example is
# not indented deeper than the `if` itself. Possibly whitespace was collapsed
# somewhere along the way — confirm these strings against the upstream
# few-shot definitions before relying on them verbatim.
_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
    {
        "text": "Write a function to find the similar elements from the given two tuple lists.",
        "code": (
            "def similar_elements(test_tup1, test_tup2):\n"
            " res = tuple(set(test_tup1) & set(test_tup2))\n return (res)"
        ),
        "test_list": [
            "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
            "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
            "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
        ],
    },
    {
        "text": "Write a python function to identify non-prime numbers.",
        "code": (
            "import math\ndef is_not_prime(n):\n result = False\n"
            " for i in range(2,int(math.sqrt(n)) + 1):\n"
            " if n % i == 0:\n result = True\n return result"
        ),
        "test_list": [
            "assert is_not_prime(2) == False",
            "assert is_not_prime(10) == True",
            "assert is_not_prime(35) == True",
        ],
    },
    {
        "text": (
            "Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
        ),
        "code": (
            "import heapq as hq\ndef heap_queue_largest(nums,n):\n"
            " largest_nums = hq.nlargest(n, nums)\n return largest_nums"
        ),
        # The trailing spaces inside the first two assertion strings are part of the data.
        "test_list": [
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
        ],
    },
]
254
+
255
+
256
class MBPP_OLMES(MBPP):
    """
    MBPP OLMES variant replicating oe_eval's ``mbpp:3shot::olmo3:n32:v2``.

    Uses the EvalPlus prompt format with 3 hardcoded fewshot examples from the
    original MBPP "prompt" split (matching oe_eval's ordering). Each prompt
    shows one test case (the first) instead of all.

    Recommended EvalConfig settings for full replication::

        split: test
        num_fewshot: 3 (hardcoded, prompt split)
        metric: pass_at_1
        temperature: 0.6
        top_p: 0.6
        repeats: 32
    """

    NAME = "MBPP_OLMES"
    FEWSHOT_SPLIT = "test"

    def __init__(self, num_fewshot: int = 3) -> None:
        """Initialise the task and install the OLMES stop sequences.

        Args:
            num_fewshot: Must be exactly 3; the few-shot examples are hardcoded.

        Raises:
            AssertionError: If ``num_fewshot`` is not 3.
        """
        super().__init__(num_fewshot)
        # Raised explicitly rather than via a bare ``assert`` so the check is not
        # stripped under ``python -O``. The exception type is kept as
        # AssertionError so existing handlers keep matching.
        if num_fewshot != 3:
            raise AssertionError("MBPP_OLMES requires exactly 3 fewshot examples")
        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Render the instruction: task description plus the first test case.

        The description is read from ``item["text"]`` when that key exists and
        from ``item["prompt"]`` otherwise; only ``item["test_list"][0]`` is shown.
        """
        text = item["text"] if "text" in item else item["prompt"]
        test = item["test_list"][0]
        return (
            "Please provide a self-contained Python script that solves the following problem"
            f" in a markdown code block:\n```\n{text.strip()}\n{test}\n```\n"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that opens the Python code block (``item`` is unused)."""
        return "Here is the completed function:\n\n```python\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the few-shot example's ``code`` followed by a newline."""
        return item["code"] + "\n"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return the hardcoded few-shot examples, ignoring ``item``.

        Only the outer list is copied; the example dicts themselves are shared
        with the module-level constant.
        """
        return list(_OLMES_FEWSHOT_EXAMPLES)

    def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str:  # type: ignore[override]
        """Truncate the completion at the stop sequences and expand it into test code.

        Args:
            completion_text: Raw model output.
            sample: Sample whose ``ground_truth`` is stringified and handed to
                ``_code_expander`` together with the truncated completion.

        Returns:
            Whatever ``self._code_expander`` produces for the truncated code.
        """
        assert self.stop_sequences is not None

        # Cutting at each stop sequence in turn leaves the text that precedes the
        # earliest occurrence of any of them; ``partition`` returns the whole
        # string unchanged when the separator is absent.
        for stop_seq in self.stop_sequences:
            completion_text = completion_text.partition(stop_seq)[0]

        extracted_code = completion_text + "\n"
        mbpp_ground_truth = str(sample.ground_truth)
        return self._code_expander(extracted_code, mbpp_ground_truth)
@@ -19,6 +19,7 @@ def register_all_tasks() -> None:
19
19
  """Register all the benchmark tasks with the eval framework."""
20
20
  register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
21
21
  register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025")
22
+ register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2026")
22
23
  register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
23
24
  register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK")
24
25
  register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_OLMES")
@@ -88,6 +89,7 @@ def register_all_tasks() -> None:
88
89
  register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
89
90
  register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
90
91
  register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
92
+ register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES")
91
93
  register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
92
94
  register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK")
93
95
  register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES")
@@ -74,13 +74,17 @@ def generate_docs_for_task(
74
74
  try:
75
75
  num_fewshot = 1
76
76
  task = task_class(num_fewshot=num_fewshot)
77
- except Exception:
77
+ except (TypeError, ValueError, AssertionError):
78
78
  try:
79
79
  num_fewshot = 0
80
80
  task = task_class(num_fewshot=num_fewshot)
81
- except Exception as e:
82
- print(f"Failed to instantiate task {task_name}: {e}")
83
- return
81
+ except (TypeError, ValueError, AssertionError):
82
+ try:
83
+ task = task_class()
84
+ num_fewshot = task.num_fewshot
85
+ except Exception as e:
86
+ print(f"Failed to instantiate task {task_name}: {e}")
87
+ return
84
88
 
85
89
  with open(f"{output_docs_directory}/{task_name}.md", "w") as f:
86
90
  f.write(f"# {task_name}\n\n")
File without changes