eval-framework 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.2 → eval_framework-0.3.3}/PKG-INFO +1 -1
  2. {eval_framework-0.3.2 → eval_framework-0.3.3}/pyproject.toml +1 -1
  3. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/drop_completion.py +2 -2
  4. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/math_minerva_completion.py +9 -22
  5. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/truthfulqa.py +17 -2
  6. {eval_framework-0.3.2 → eval_framework-0.3.3}/LICENSE +0 -0
  7. {eval_framework-0.3.2 → eval_framework-0.3.3}/README.md +0 -0
  8. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/__init__.py +0 -0
  9. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/base_config.py +0 -0
  10. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/context/__init__.py +0 -0
  11. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/context/determined.py +0 -0
  12. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/context/eval.py +0 -0
  13. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/context/local.py +0 -0
  14. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/evaluation_generator.py +0 -0
  15. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/exceptions.py +0 -0
  16. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/drop_process_results.py +0 -0
  17. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  18. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  19. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  20. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  21. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  22. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/__init__.py +0 -0
  23. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/aleph_alpha.py +0 -0
  24. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/base.py +0 -0
  25. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/huggingface.py +0 -0
  26. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/mistral.py +0 -0
  27. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/models.py +0 -0
  28. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/openai.py +0 -0
  29. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/llm/vllm.py +0 -0
  30. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/logger.py +0 -0
  31. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/main.py +0 -0
  32. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/__init__.py +0 -0
  33. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  34. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  35. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/base.py +0 -0
  36. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/__init__.py +0 -0
  37. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  38. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  39. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/bleu.py +0 -0
  40. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/chrf.py +0 -0
  41. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  42. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  43. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/comet.py +0 -0
  44. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  45. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  46. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  47. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  48. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/f1.py +0 -0
  49. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  50. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  51. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  52. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/json_format.py +0 -0
  53. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  54. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/length_control.py +0 -0
  55. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  56. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  57. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  58. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/base.py +0 -0
  72. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  73. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  74. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  75. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  76. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  77. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  78. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  79. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  80. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  81. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  82. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  83. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  84. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  85. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  86. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  87. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  88. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  89. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  90. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  91. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  92. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  93. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  94. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  95. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  96. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  97. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/utils.py +0 -0
  98. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  99. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  100. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  101. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  102. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  103. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  104. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  105. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  106. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/py.typed +0 -0
  107. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/response_generator.py +0 -0
  108. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/result_processors/__init__.py +0 -0
  109. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/result_processors/base.py +0 -0
  110. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  111. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/result_processors/result_processor.py +0 -0
  112. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  113. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/run.py +0 -0
  114. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/suite.py +0 -0
  117. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  118. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/base.py +0 -0
  120. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  121. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  122. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  144. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  145. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  146. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  147. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  148. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  149. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  150. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  151. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  152. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  153. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  154. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  155. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  156. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  157. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  158. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  159. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  160. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  161. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  162. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  163. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  164. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  165. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  166. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  167. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  168. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  169. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  170. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  171. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  172. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  173. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/eval_config.py +0 -0
  174. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.2 → eval_framework-0.3.3}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.2"
3
+ version = "0.3.3"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -18,7 +18,7 @@ class DropMetricContext(BaseMetricContext):
18
18
  class DropF1ExactMatch(BaseMetric[Completion]):
19
19
  """DROP F1 and exact match. Requires DropMetricContext with answer_tuples."""
20
20
 
21
- NAME = "DROP F1 / Exact Match"
21
+ NAME = "Drop F1"
22
22
  KEYS = ["f1", "exact_match"]
23
23
 
24
24
  def calculate(self, response: Completion) -> list[MetricResult]:
@@ -52,5 +52,5 @@ class DropF1ExactMatch(BaseMetric[Completion]):
52
52
  higher_is_better=True,
53
53
  error=response.error,
54
54
  )
55
- for name, key in zip([n.strip() for n in self.NAME.split("/")], self.KEYS)
55
+ for name, key in zip(self.NAMES, self.KEYS)
56
56
  ]
@@ -20,6 +20,7 @@ class MathMinervaCompletion(BaseMetric[Completion]):
20
20
  """
21
21
 
22
22
  NAME = "Math Minerva Completion"
23
+ KEYS = ["Exact", "Exact Flex"]
23
24
  AGGREGATORS = [PassAtK()]
24
25
 
25
26
  def __init__(
@@ -36,17 +37,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
36
37
  if response.error:
37
38
  return [
38
39
  MetricResult(
39
- metric_name="Exact Match",
40
+ metric_name=x,
40
41
  value=None,
41
42
  higher_is_better=True,
42
43
  error=response.error,
43
- ),
44
- MetricResult(
45
- metric_name="Exact Match (Flex)",
46
- value=None,
47
- higher_is_better=True,
48
- error=response.error,
49
- ),
44
+ )
45
+ for x in self.NAMES
50
46
  ]
51
47
 
52
48
  gold = response.ground_truth
@@ -55,17 +51,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
55
51
  if not gold:
56
52
  return [
57
53
  MetricResult(
58
- metric_name="Exact Match",
59
- value=None,
60
- higher_is_better=True,
61
- error="No ground truth available",
62
- ),
63
- MetricResult(
64
- metric_name="Exact Match (Flex)",
54
+ metric_name=x,
65
55
  value=None,
66
56
  higher_is_better=True,
67
57
  error="No ground truth available",
68
- ),
58
+ )
59
+ for x in self.NAMES
69
60
  ]
70
61
 
71
62
  raw = response.raw_completion or response.completion
@@ -84,12 +75,8 @@ class MathMinervaCompletion(BaseMetric[Completion]):
84
75
  )
85
76
 
86
77
  return [
87
- MetricResult(metric_name="Exact Match", value=exact_match, higher_is_better=True),
88
- MetricResult(
89
- metric_name="Exact Match (Flex)",
90
- value=exact_match_flex,
91
- higher_is_better=True,
92
- ),
78
+ MetricResult(metric_name=name, value=value, higher_is_better=True)
79
+ for name, value in zip(self.NAMES, [exact_match, exact_match_flex])
93
80
  ]
94
81
 
95
82
 
@@ -106,13 +106,18 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
106
106
  """
107
107
 
108
108
  NAME = "TruthfulQA_OLMES"
109
+ FEWSHOT_SPLIT = "validation" # use dataset few-shot for multiple-choice options
110
+ PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
111
+
112
+ def _get_cue_text(self, item: dict[str, Any]) -> str:
113
+ return "Answer:"
109
114
 
110
115
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
111
116
  question = item["question"]
112
117
  choices = item[self.target_identifier]["choices"]
113
118
  labels = get_n_letters(len(choices))
114
119
  options = "\n".join(f" {label}. {choice}" for label, choice in zip(labels, choices))
115
- return f"Q: {question}\n{options}\n"
120
+ return f"Question: {question}\n{options}\n"
116
121
 
117
122
  def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
118
123
  labels_arr = item[self.target_identifier]["labels"]
@@ -126,7 +131,17 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
126
131
  return [f" {letter}" for letter in letters]
127
132
 
128
133
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
129
- return self.FEWSHOT_ITEMS[: self.num_fewshot]
134
+ # Reuse BaseTask's split-based sampler.
135
+ return BaseTask._sample_fewshot_examples(self, item)
136
+
137
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
138
+ cue_text = self._get_cue_text(item)
139
+ labels_arr = item[self.target_identifier]["labels"]
140
+ letters = get_n_letters(len(labels_arr))
141
+ # Pick one correct option for the demonstration answer.
142
+ correct_letters = [letters[i] for i, label in enumerate(labels_arr) if label == 1]
143
+ letter = correct_letters[0] if correct_letters else letters[0]
144
+ return f"{cue_text} {letter}"
130
145
 
131
146
 
132
147
  class TRUTHFULQA_IDK(TRUTHFULQA):
File without changes
File without changes