eval-framework 0.3.5__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.5 → eval_framework-0.3.7}/PKG-INFO +1 -1
  2. {eval_framework-0.3.5 → eval_framework-0.3.7}/pyproject.toml +1 -1
  3. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/openai.py +23 -10
  4. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_minerva_completion.py +33 -12
  5. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/minerva_math_utils.py +45 -4
  6. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/ifeval.py +2 -0
  7. {eval_framework-0.3.5 → eval_framework-0.3.7}/LICENSE +0 -0
  8. {eval_framework-0.3.5 → eval_framework-0.3.7}/README.md +0 -0
  9. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/__init__.py +0 -0
  10. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/base_config.py +0 -0
  11. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/context/__init__.py +0 -0
  12. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/context/determined.py +0 -0
  13. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/context/eval.py +0 -0
  14. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/context/local.py +0 -0
  15. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/evaluation_generator.py +0 -0
  16. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/exceptions.py +0 -0
  17. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/drop_process_results.py +0 -0
  18. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  19. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  20. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  21. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  22. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  23. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/__init__.py +0 -0
  24. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/aleph_alpha.py +0 -0
  25. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/base.py +0 -0
  26. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/huggingface.py +0 -0
  27. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/mistral.py +0 -0
  28. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/models.py +0 -0
  29. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/llm/vllm.py +0 -0
  30. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/logger.py +0 -0
  31. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/main.py +0 -0
  32. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/__init__.py +0 -0
  33. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  34. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  35. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/base.py +0 -0
  36. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/__init__.py +0 -0
  37. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  38. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  39. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/bleu.py +0 -0
  40. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/chrf.py +0 -0
  41. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  42. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  43. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/comet.py +0 -0
  44. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  45. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  46. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  47. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  48. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  49. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/f1.py +0 -0
  50. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  51. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  52. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  53. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/json_format.py +0 -0
  54. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  55. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/length_control.py +0 -0
  56. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  57. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  58. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/base.py +0 -0
  72. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  73. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  74. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  75. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  76. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  77. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  78. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  79. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  80. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  81. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  82. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  83. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  84. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  85. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  86. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  87. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  88. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  89. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  90. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  91. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  92. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  93. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  94. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  95. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  96. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  97. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/utils.py +0 -0
  98. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  99. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  100. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  101. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  102. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  103. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  104. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  105. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  106. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/py.typed +0 -0
  107. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/response_generator.py +0 -0
  108. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/result_processors/__init__.py +0 -0
  109. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/result_processors/base.py +0 -0
  110. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  111. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/result_processors/result_processor.py +0 -0
  112. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  113. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/run.py +0 -0
  114. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/suite.py +0 -0
  117. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  118. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/base.py +0 -0
  120. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  121. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  122. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  144. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  145. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  146. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  147. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  148. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  149. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  150. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  151. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  152. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  153. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  154. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  155. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  156. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  157. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  158. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  159. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  160. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  161. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  162. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  163. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  164. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  165. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  166. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  167. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  168. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  169. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  170. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  171. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  172. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  173. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/eval_config.py +0 -0
  174. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.5 → eval_framework-0.3.7}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.5
3
+ Version: 0.3.7
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.5"
3
+ version = "0.3.7"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -8,18 +8,38 @@ from functools import partial
8
8
 
9
9
  import tiktoken
10
10
  from openai import OpenAI
11
- from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
11
+ from openai.types.chat import (
12
+ ChatCompletionAssistantMessageParam,
13
+ ChatCompletionMessageParam,
14
+ ChatCompletionSystemMessageParam,
15
+ ChatCompletionUserMessageParam,
16
+ )
12
17
  from tokenizers import Tokenizer
13
18
  from transformers import AutoTokenizer
14
19
 
15
20
  from eval_framework.llm.base import BaseLLM
16
21
  from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
17
22
  from eval_framework.tasks.base import Sample
18
- from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
23
+ from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message, Role
19
24
 
20
25
  logger = logging.getLogger(__name__)
21
26
 
22
27
 
28
+ def _to_chat_completion_message(message: Message) -> ChatCompletionMessageParam:
29
+ match message.role:
30
+ case Role.SYSTEM:
31
+ return ChatCompletionSystemMessageParam(role="system", content=message.content)
32
+ case Role.USER:
33
+ return ChatCompletionUserMessageParam(role="user", content=message.content)
34
+ case Role.ASSISTANT:
35
+ return ChatCompletionAssistantMessageParam(role="assistant", content=message.content)
36
+ case None:
37
+ raise ValueError(
38
+ "Cannot send a Message without a role through the chat completion API; "
39
+ "the legacy roleless format is only supported for fine-tuning."
40
+ )
41
+
42
+
23
43
  class OpenAIModel(BaseLLM):
24
44
  """
25
45
  LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
@@ -158,14 +178,7 @@ class OpenAIModel(BaseLLM):
158
178
 
159
179
  else:
160
180
  # Use chat completion API
161
- chat_messages = [
162
- (
163
- ChatCompletionUserMessageParam(role="user", content=m.content)
164
- if m.role is not None and m.role.value.lower() == "user"
165
- else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
166
- )
167
- for m in single_messages
168
- ]
181
+ chat_messages = [_to_chat_completion_message(m) for m in single_messages]
169
182
  assert self._model_name is not None
170
183
  chat_response = self._client.chat.completions.create(
171
184
  model=self._model_name,
@@ -17,21 +17,28 @@ class MathMinervaCompletion(BaseMetric[Completion]):
17
17
  Minerva MATH: reports Exact Match and Exact Match (Flex).
18
18
  Uses raw_completion to extract multiple candidates; primary for exact_match,
19
19
  all candidates with both Minerva and Hendrycks equivalence for exact_match_flex.
20
+
21
+ English Minerva extraction is the default. Subclasses select other
22
+ final-answer styles by overriding ``COT_STYLE`` / ``RELAXED``.
20
23
  """
21
24
 
22
25
  NAME = "Math Minerva Completion"
23
26
  KEYS = ["Exact", "Exact Flex"]
24
27
  AGGREGATORS = [PassAtK()]
25
28
 
29
+ # Defaults; subclasses override these class attributes to define variants.
30
+ COT_STYLE: str = "minerva"
31
+ RELAXED: bool = False
32
+
26
33
  def __init__(
27
34
  self,
28
35
  use_cot: bool = True,
29
- cot_style: str = "minerva",
30
- relaxed: bool = False,
36
+ cot_style: str | None = None,
37
+ relaxed: bool | None = None,
31
38
  ) -> None:
32
39
  self.use_cot = use_cot
33
- self.cot_style = cot_style
34
- self.relaxed = relaxed
40
+ self.cot_style = cot_style if cot_style is not None else self.COT_STYLE
41
+ self.relaxed = relaxed if relaxed is not None else self.RELAXED
35
42
 
36
43
  def calculate(self, response: Completion) -> list[MetricResult]:
37
44
  if response.error:
@@ -60,7 +67,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
60
67
  ]
61
68
 
62
69
  raw = response.raw_completion or response.completion
63
- all_candidates = extract_answers(raw, use_cot=self.use_cot, cot_style=self.cot_style, relaxed=self.relaxed)
70
+ all_candidates = extract_answers(
71
+ raw,
72
+ use_cot=self.use_cot,
73
+ cot_style=self.cot_style,
74
+ relaxed=self.relaxed,
75
+ )
64
76
 
65
77
  exact_match = 0.0
66
78
  if all_candidates:
@@ -83,10 +95,19 @@ class MathMinervaCompletion(BaseMetric[Completion]):
83
95
  class MathMinervaCompletionRelaxed(MathMinervaCompletion):
84
96
  """MathMinervaCompletion with relaxed=True by default (flexible final-answer matching)."""
85
97
 
86
- def __init__(
87
- self,
88
- use_cot: bool = True,
89
- cot_style: str = "minerva",
90
- relaxed: bool = True,
91
- ) -> None:
92
- super().__init__(use_cot=use_cot, cot_style=cot_style, relaxed=relaxed)
98
+ NAME = "Math Minerva Completion Relaxed"
99
+ RELAXED = True
100
+
101
+
102
+ class MathMinervaCompletionDE(MathMinervaCompletion):
103
+ """MathMinervaCompletion with German final-answer extraction (``Finale Antwort: …``)."""
104
+
105
+ NAME = "Math Minerva Completion DE"
106
+ COT_STYLE = "minerva_de"
107
+
108
+
109
+ class MathMinervaCompletionRelaxedDE(MathMinervaCompletionDE):
110
+ """MathMinervaCompletionDE with relaxed=True by default."""
111
+
112
+ NAME = "Math Minerva Completion Relaxed DE"
113
+ RELAXED = True
@@ -12,6 +12,7 @@ from sympy.parsing.latex.errors import LaTeXParsingError
12
12
 
13
13
  INVALID_ANSWER = "[invalidanswer]"
14
14
  END_SEQ = "I hope it is correct."
15
+ END_SEQ_DE = "Ich hoffe, die Antwort ist korrekt." # German pendant to END_SEQ
15
16
 
16
17
  # Minerva normalize_final_answer: appendix D of Lewkowycz et al. (2022)
17
18
  SUBSTITUTIONS = [
@@ -143,6 +144,44 @@ def get_unnormalized_answer(text: str, relaxed: bool = False) -> str:
143
144
  return INVALID_ANSWER
144
145
 
145
146
 
147
+ def get_unnormalized_answer_de(text: str, relaxed: bool = False) -> str:
148
+ """German analogue of ``get_unnormalized_answer``."""
149
+ if relaxed:
150
+ match = re.search(
151
+ r"(?i)(?:finale|endgültige)\s+antwort\s*:\s*"
152
+ r"(?:die\s+(?:finale\s+|endgültige\s+)?antwort\s+(?:ist|lautet)\s*)?(.*)",
153
+ text,
154
+ re.DOTALL,
155
+ )
156
+ if match:
157
+ raw = match.group(1).strip()
158
+ raw = re.sub(
159
+ r"\.?\s*ich\s+hoffe,?\s+(?:die\s+antwort|sie|es)\s+(?:ist|sei)\s+korrekt\.?\s*$",
160
+ "",
161
+ raw,
162
+ flags=re.IGNORECASE,
163
+ ).strip()
164
+ return raw
165
+ return INVALID_ANSWER
166
+ text = text + END_SEQ_DE
167
+ match = re.search(
168
+ r"Finale Antwort: Die finale Antwort lautet(.*?)\. Ich hoffe, die Antwort ist korrekt\.",
169
+ text,
170
+ )
171
+ if match:
172
+ return match.group(1).strip()
173
+ return INVALID_ANSWER
174
+
175
+
176
+ # Registry of supported `cot_style` values
177
+ # Keys are the strings passed by metric configurations; values are language-specific final-answer extractors
178
+ # `(text: str, relaxed: bool) -> str`. Extend this dict to add a new language.
179
+ COT_EXTRACTORS = {
180
+ "minerva": get_unnormalized_answer,
181
+ "minerva_de": get_unnormalized_answer_de,
182
+ }
183
+
184
+
146
185
  def normalized_gold_from_solution(solution: str) -> str | None:
147
186
  """Extract and normalize the gold answer from a solution string (last \\boxed{...})."""
148
187
  boxed = last_boxed_only_string(solution)
@@ -368,10 +407,12 @@ def extract_answers(
368
407
  all_answers: list[str] = []
369
408
 
370
409
  if use_cot:
371
- if cot_style == "minerva":
372
- minerva_answer = normalize_final_answer(get_unnormalized_answer(raw, relaxed=relaxed))
373
- if minerva_answer and minerva_answer != INVALID_ANSWER:
374
- all_answers.append(minerva_answer)
410
+ if cot_style not in COT_EXTRACTORS:
411
+ raise ValueError(f"Unknown cot_style {cot_style!r}; valid: {sorted(COT_EXTRACTORS)}")
412
+ extractor = COT_EXTRACTORS[cot_style]
413
+ minerva_answer = normalize_final_answer(extractor(raw, relaxed=relaxed))
414
+ if minerva_answer and minerva_answer != INVALID_ANSWER:
415
+ all_answers.append(minerva_answer)
375
416
  boxed = last_boxed_only_string(raw)
376
417
  if boxed is not None:
377
418
  try:
@@ -1,6 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
4
+ from eval_framework.metrics.completion.language_checker import LanguageRawConsistencyChecker
4
5
  from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
5
6
 
6
7
 
@@ -76,3 +77,4 @@ class IFEvalDe(IFEval):
76
77
  DATASET_PATH = "jzhang86/de_ifeval"
77
78
  SUBJECTS = [NO_SUBJECT]
78
79
  LANGUAGE = {NO_SUBJECT: Language.DEU}
80
+ METRICS = [IFEvalMetric, LanguageRawConsistencyChecker]
File without changes
File without changes