eval-framework 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.2 → eval_framework-0.3.4}/PKG-INFO +1 -1
  2. {eval_framework-0.3.2 → eval_framework-0.3.4}/pyproject.toml +1 -1
  3. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/drop_completion.py +2 -2
  4. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_minerva_completion.py +9 -22
  5. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/response_generator.py +1 -1
  6. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/base.py +30 -11
  7. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
  8. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +17 -2
  9. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/eval_config.py +1 -1
  10. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/utils.py +19 -1
  11. {eval_framework-0.3.2 → eval_framework-0.3.4}/LICENSE +0 -0
  12. {eval_framework-0.3.2 → eval_framework-0.3.4}/README.md +0 -0
  13. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/__init__.py +0 -0
  14. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/base_config.py +0 -0
  15. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/__init__.py +0 -0
  16. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/determined.py +0 -0
  17. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/eval.py +0 -0
  18. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/local.py +0 -0
  19. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/evaluation_generator.py +0 -0
  20. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/exceptions.py +0 -0
  21. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/drop_process_results.py +0 -0
  22. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  23. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  24. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  25. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  26. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  27. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/__init__.py +0 -0
  28. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/aleph_alpha.py +0 -0
  29. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/base.py +0 -0
  30. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/huggingface.py +0 -0
  31. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/mistral.py +0 -0
  32. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/models.py +0 -0
  33. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/openai.py +0 -0
  34. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/vllm.py +0 -0
  35. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/logger.py +0 -0
  36. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/main.py +0 -0
  37. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/__init__.py +0 -0
  38. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  39. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  40. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/base.py +0 -0
  41. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
  42. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  43. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  44. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
  45. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
  46. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  47. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  48. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/comet.py +0 -0
  49. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  50. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  51. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  52. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  53. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/f1.py +0 -0
  54. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  55. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  56. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  57. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
  58. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  59. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
  60. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  61. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  62. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  63. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  64. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  65. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
  66. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  67. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  68. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  69. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  70. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  71. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ter.py +0 -0
  72. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  73. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  74. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  75. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
  76. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/base.py +0 -0
  77. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  78. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  79. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  80. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  81. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  82. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  83. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  84. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  85. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  86. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  87. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  88. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  89. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  90. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  91. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  92. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  93. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  94. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  95. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  96. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  97. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  98. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  99. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  100. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  101. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  102. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/utils.py +0 -0
  103. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  104. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  105. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  106. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  107. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  108. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  109. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  110. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  111. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/py.typed +0 -0
  112. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/__init__.py +0 -0
  113. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/base.py +0 -0
  114. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  115. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/result_processor.py +0 -0
  116. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  117. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/run.py +0 -0
  118. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/run_direct.py +0 -0
  119. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/shared/types.py +0 -0
  120. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/suite.py +0 -0
  121. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  122. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/__init__.py +0 -0
  123. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  124. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  125. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  126. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  127. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  128. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  129. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  130. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  131. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  132. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  133. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  134. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  135. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  136. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  137. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  138. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  139. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  140. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  141. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  142. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  143. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  144. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  145. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  146. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  147. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  148. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  149. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  150. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  151. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  152. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  153. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  154. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  155. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  156. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  157. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  158. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  159. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  160. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  161. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  162. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  163. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  164. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  165. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  166. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  167. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  168. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  169. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  170. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  171. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  172. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  173. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  174. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  175. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/perturbation.py +0 -0
  176. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/registry.py +0 -0
  177. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_loader.py +0 -0
  178. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_names.py +0 -0
  179. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_style.py +0 -0
  180. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.2"
3
+ version = "0.3.4"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -18,7 +18,7 @@ class DropMetricContext(BaseMetricContext):
18
18
  class DropF1ExactMatch(BaseMetric[Completion]):
19
19
  """DROP F1 and exact match. Requires DropMetricContext with answer_tuples."""
20
20
 
21
- NAME = "DROP F1 / Exact Match"
21
+ NAME = "Drop F1"
22
22
  KEYS = ["f1", "exact_match"]
23
23
 
24
24
  def calculate(self, response: Completion) -> list[MetricResult]:
@@ -52,5 +52,5 @@ class DropF1ExactMatch(BaseMetric[Completion]):
52
52
  higher_is_better=True,
53
53
  error=response.error,
54
54
  )
55
- for name, key in zip([n.strip() for n in self.NAME.split("/")], self.KEYS)
55
+ for name, key in zip(self.NAMES, self.KEYS)
56
56
  ]
@@ -20,6 +20,7 @@ class MathMinervaCompletion(BaseMetric[Completion]):
20
20
  """
21
21
 
22
22
  NAME = "Math Minerva Completion"
23
+ KEYS = ["Exact", "Exact Flex"]
23
24
  AGGREGATORS = [PassAtK()]
24
25
 
25
26
  def __init__(
@@ -36,17 +37,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
36
37
  if response.error:
37
38
  return [
38
39
  MetricResult(
39
- metric_name="Exact Match",
40
+ metric_name=x,
40
41
  value=None,
41
42
  higher_is_better=True,
42
43
  error=response.error,
43
- ),
44
- MetricResult(
45
- metric_name="Exact Match (Flex)",
46
- value=None,
47
- higher_is_better=True,
48
- error=response.error,
49
- ),
44
+ )
45
+ for x in self.NAMES
50
46
  ]
51
47
 
52
48
  gold = response.ground_truth
@@ -55,17 +51,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
55
51
  if not gold:
56
52
  return [
57
53
  MetricResult(
58
- metric_name="Exact Match",
59
- value=None,
60
- higher_is_better=True,
61
- error="No ground truth available",
62
- ),
63
- MetricResult(
64
- metric_name="Exact Match (Flex)",
54
+ metric_name=x,
65
55
  value=None,
66
56
  higher_is_better=True,
67
57
  error="No ground truth available",
68
- ),
58
+ )
59
+ for x in self.NAMES
69
60
  ]
70
61
 
71
62
  raw = response.raw_completion or response.completion
@@ -84,12 +75,8 @@ class MathMinervaCompletion(BaseMetric[Completion]):
84
75
  )
85
76
 
86
77
  return [
87
- MetricResult(metric_name="Exact Match", value=exact_match, higher_is_better=True),
88
- MetricResult(
89
- metric_name="Exact Match (Flex)",
90
- value=exact_match_flex,
91
- higher_is_better=True,
92
- ),
78
+ MetricResult(metric_name=name, value=value, higher_is_better=True)
79
+ for name, value in zip(self.NAMES, [exact_match, exact_match_flex])
93
80
  ]
94
81
 
95
82
 
@@ -78,7 +78,7 @@ class ResponseGenerator:
78
78
  custom_hf_revision=self.config.hf_revision,
79
79
  )
80
80
 
81
- self.response_type, _ = self.task._get_type_and_metrics()
81
+ self.response_type = self.task.get_response_type()
82
82
 
83
83
  def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
84
84
  """
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
15
15
  from pydantic import BaseModel, ConfigDict
16
16
 
17
17
  from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
18
- from eval_framework.tasks.utils import raise_errors
18
+ from eval_framework.tasks.utils import classproperty, raise_errors
19
19
  from template_formatting.formatter import Message, Role
20
20
 
21
21
  if TYPE_CHECKING:
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
91
91
  DATASET_PATH: str
92
92
  SAMPLE_SPLIT: str
93
93
  FEWSHOT_SPLIT: str
94
- RESPONSE_TYPE: ResponseType
95
- METRICS: list[type["BaseMetric"]]
96
94
  SUBJECTS: list[SubjectType]
97
95
  HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility
98
96
 
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
104
102
  # language by subtopic, or `None` (for tasks not specific to a single language).
105
103
  LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
106
104
 
105
+ # RESPONSE_TYPE and METRICS use exposed as classproperties, so you can access them via either
106
+ # `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
107
+ # By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
108
+
107
109
  def __init__(self, num_fewshot: int = 0) -> None:
108
110
  self.num_fewshot = num_fewshot
109
111
  self.stop_sequences: list[str] | None = None
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
332
334
  return None
333
335
 
334
336
  def get_metadata(self) -> dict[str, str | list[str]]:
335
- response_type, metrics = self._get_type_and_metrics()
336
-
337
337
  meta: dict[str, str | list[str]] = {
338
338
  "dataset_path": self.DATASET_PATH,
339
339
  "sample_split": self.SAMPLE_SPLIT,
340
340
  "fewshot_split": self.FEWSHOT_SPLIT,
341
- "response_type": response_type.value,
342
- "metrics": [m.NAME for m in metrics],
341
+ "response_type": self.get_response_type().value,
342
+ "metrics": [m.NAME for m in self.get_metrics()],
343
343
  "subjects": [str(s) for s in self.SUBJECTS],
344
344
  }
345
345
  if hasattr(self, "TASK_STYLER"):
@@ -420,7 +420,26 @@ class BaseTask[SubjectType](ABC):
420
420
  )
421
421
  return completion_list
422
422
 
423
- def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]:
424
- if hasattr(self, "TASK_STYLER"):
425
- return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
426
- return self.RESPONSE_TYPE, self.METRICS
423
+ @classmethod
424
+ def get_response_type(cls) -> ResponseType:
425
+ """Return the response type of the task (or the styler if it exists)."""
426
+ if hasattr(cls, "TASK_STYLER"):
427
+ return cls.TASK_STYLER.response_type
428
+ return cls.RESPONSE_TYPE
429
+
430
+ @classmethod
431
+ def get_metrics(cls) -> list[type["BaseMetric"]]:
432
+ """Return the metrics of the task (or the styler if it exists)."""
433
+ if hasattr(cls, "TASK_STYLER"):
434
+ return cls.TASK_STYLER.metrics
435
+ return cls.METRICS
436
+
437
+ @classproperty
438
+ def RESPONSE_TYPE(cls) -> ResponseType:
439
+ """For backwards compatibility."""
440
+ return cls.get_response_type()
441
+
442
+ @classproperty
443
+ def METRICS(cls) -> list[type["BaseMetric"]]:
444
+ """For backwards compatibility."""
445
+ return cls.get_metrics()
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
86
86
 
87
87
  class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
88
88
  NAME = "NaturalQsOpenMC"
89
- TASK_STYLER = MCStyle(space_prefixed_labels=True)
89
+ TASK_STYLER = MCStyle()
90
90
 
91
91
 
92
92
  class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
@@ -106,13 +106,18 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
106
106
  """
107
107
 
108
108
  NAME = "TruthfulQA_OLMES"
109
+ FEWSHOT_SPLIT = "validation" # use dataset few-shot for multiple-choice options
110
+ PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
111
+
112
+ def _get_cue_text(self, item: dict[str, Any]) -> str:
113
+ return "Answer:"
109
114
 
110
115
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
111
116
  question = item["question"]
112
117
  choices = item[self.target_identifier]["choices"]
113
118
  labels = get_n_letters(len(choices))
114
119
  options = "\n".join(f" {label}. {choice}" for label, choice in zip(labels, choices))
115
- return f"Q: {question}\n{options}\n"
120
+ return f"Question: {question}\n{options}\n"
116
121
 
117
122
  def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
118
123
  labels_arr = item[self.target_identifier]["labels"]
@@ -126,7 +131,17 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
126
131
  return [f" {letter}" for letter in letters]
127
132
 
128
133
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
129
- return self.FEWSHOT_ITEMS[: self.num_fewshot]
134
+ # Reuse BaseTask's split-based sampler.
135
+ return BaseTask._sample_fewshot_examples(self, item)
136
+
137
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
138
+ cue_text = self._get_cue_text(item)
139
+ labels_arr = item[self.target_identifier]["labels"]
140
+ letters = get_n_letters(len(labels_arr))
141
+ # Pick one correct option for the demonstration answer.
142
+ correct_letters = [letters[i] for i, label in enumerate(labels_arr) if label == 1]
143
+ letter = correct_letters[0] if correct_letters else letters[0]
144
+ return f"{cue_text} {letter}"
130
145
 
131
146
 
132
147
  class TRUTHFULQA_IDK(TRUTHFULQA):
@@ -112,7 +112,7 @@ class EvalConfig(BaseConfig):
112
112
  @model_validator(mode="after")
113
113
  def validate_llm_judge_defined(self) -> "EvalConfig":
114
114
  task = get_task(self.task_name)
115
- _, task_metrics = task(num_fewshot=0)._get_type_and_metrics()
115
+ task_metrics = task(num_fewshot=0).get_metrics()
116
116
  for metric_class in task_metrics:
117
117
  if issubclass(metric_class, BaseLLMJudgeMetric):
118
118
  assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
@@ -8,7 +8,7 @@ import string
8
8
  import threading
9
9
  from collections.abc import Callable
10
10
  from pathlib import Path
11
- from typing import Any, Literal, NamedTuple
11
+ from typing import Any, Literal, NamedTuple, overload
12
12
 
13
13
  import dill
14
14
  import numpy as np
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
22
22
  RANDOM_SEED = 42 # hacky way to get around circular import
23
23
  redis_warning_printed = False
24
24
 
25
+
26
+ class classproperty[T]:
27
+ """Descriptor supporting property-like access on classes and instances."""
28
+
29
+ def __init__(self, fget: Callable[[Any], T]) -> None:
30
+ self.fget = fget
31
+
32
+ @overload
33
+ def __get__(self, obj: None, owner: type[Any]) -> T: ...
34
+
35
+ @overload
36
+ def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
37
+
38
+ def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
39
+ cls = owner if owner is not None else type(obj)
40
+ return self.fget(cls)
41
+
42
+
25
43
  _pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
26
44
  _pools_lock = threading.Lock()
27
45
 
Remaining files: without changes