eval-framework 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.3 → eval_framework-0.3.5}/PKG-INFO +1 -1
  2. {eval_framework-0.3.3 → eval_framework-0.3.5}/pyproject.toml +1 -1
  3. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/evaluation_generator.py +27 -2
  4. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/response_generator.py +5 -4
  5. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/result_processor.py +4 -4
  6. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/base.py +35 -13
  7. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py +11 -2
  8. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
  9. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/eval_config.py +5 -1
  10. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/utils.py +19 -1
  11. {eval_framework-0.3.3 → eval_framework-0.3.5}/LICENSE +0 -0
  12. {eval_framework-0.3.3 → eval_framework-0.3.5}/README.md +0 -0
  13. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/__init__.py +0 -0
  14. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/base_config.py +0 -0
  15. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/__init__.py +0 -0
  16. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/determined.py +0 -0
  17. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/eval.py +0 -0
  18. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/local.py +0 -0
  19. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/exceptions.py +0 -0
  20. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py +0 -0
  21. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  22. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  23. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  24. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  25. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  26. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/__init__.py +0 -0
  27. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/aleph_alpha.py +0 -0
  28. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/base.py +0 -0
  29. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/huggingface.py +0 -0
  30. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/mistral.py +0 -0
  31. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/models.py +0 -0
  32. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/openai.py +0 -0
  33. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/vllm.py +0 -0
  34. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/logger.py +0 -0
  35. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/main.py +0 -0
  36. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/__init__.py +0 -0
  37. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  38. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  39. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/base.py +0 -0
  40. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py +0 -0
  41. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  42. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  43. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/bleu.py +0 -0
  44. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/chrf.py +0 -0
  45. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  46. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  47. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py +0 -0
  48. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  49. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  50. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  51. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  52. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  53. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/f1.py +0 -0
  54. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  55. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  56. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  57. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py +0 -0
  58. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  59. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/length_control.py +0 -0
  60. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  61. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  62. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  63. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  64. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  65. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  66. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py +0 -0
  67. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  68. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  69. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  70. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  71. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  72. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ter.py +0 -0
  73. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  74. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  75. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  76. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/__init__.py +0 -0
  77. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/base.py +0 -0
  78. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  79. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  80. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  81. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  82. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  83. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  84. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  85. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  86. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  87. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  88. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  89. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  90. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  91. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  92. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  93. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  94. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  95. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  96. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  97. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  98. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  99. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  100. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  101. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  102. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  103. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/utils.py +0 -0
  104. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  105. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  106. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  107. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  108. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  109. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  110. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  111. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  112. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/py.typed +0 -0
  113. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py +0 -0
  114. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/base.py +0 -0
  115. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  116. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  117. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/run.py +0 -0
  118. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/run_direct.py +0 -0
  119. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/shared/types.py +0 -0
  120. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/suite.py +0 -0
  121. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  122. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/__init__.py +0 -0
  123. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  124. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  125. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  126. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  127. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  128. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  129. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  130. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  131. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  132. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  133. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  134. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  135. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  136. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  137. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  138. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  139. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  140. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  141. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  142. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  143. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  144. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  145. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  146. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  147. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  148. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  149. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  150. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  151. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  152. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  153. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  154. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  155. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  156. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  157. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  158. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  159. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  160. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  161. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  162. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  163. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  164. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  165. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  166. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  167. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  168. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  169. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  170. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  171. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  172. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  173. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  174. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  175. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/perturbation.py +0 -0
  176. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/registry.py +0 -0
  177. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_loader.py +0 -0
  178. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_names.py +0 -0
  179. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_style.py +0 -0
  180. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/py.typed +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.3
+Version: 0.3.5
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.3.3"
+version = "0.3.5"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }

src/eval_framework/evaluation_generator.py
@@ -127,7 +127,18 @@ class EvaluationGenerator:
         return results
 
     def _aggregate_results(self, results: list[Result]) -> dict[str, float | None]:
-        data = pd.DataFrame([r.model_dump() for r in results])
+        data = pd.DataFrame(
+            [
+                {
+                    "metric_name": r.metric_name,
+                    "subject": r.subject,
+                    "key": r.key,
+                    "value": r.value,
+                    "error": r.error,
+                }
+                for r in results
+            ]
+        )
         if len(data) == 0:
             return {}
         data.fillna({"key": ""}, inplace=True)
@@ -251,7 +262,20 @@ class EvaluationGenerator:
         return aggregated_results
 
     def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
-        data = pd.DataFrame([r.model_dump() for r in results])
+        data = pd.DataFrame(
+            [
+                {
+                    "metric_name": r.metric_name,
+                    "metric_class_name": r.metric_class_name,
+                    "subject": r.subject,
+                    "key": r.key,
+                    "value": r.value,
+                    "error": r.error,
+                    "prompt": r.prompt,
+                }
+                for r in results
+            ]
+        )
         if len(data) == 0:
             return {}
         data = data.fillna({"key": ""})
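
Both aggregation helpers now build their DataFrame from an explicit field selection instead of `r.model_dump()`, so only the columns the grouping logic actually reads are materialized and larger nested fields on `Result` stay out of the frame. A minimal, self-contained sketch of the resulting pattern (the rows are hypothetical stand-ins for `Result` objects):

import pandas as pd

# Hypothetical rows mirroring the fields selected in the hunks above.
rows = [
    {"metric_name": "accuracy", "subject": "math", "key": None, "value": 1.0, "error": None},
    {"metric_name": "accuracy", "subject": "math", "key": None, "value": 0.0, "error": None},
]

data = pd.DataFrame(rows)
data.fillna({"key": ""}, inplace=True)  # same key normalization as above
print(data.groupby(["metric_name", "subject"])["value"].mean())  # accuracy/math -> 0.5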
@@ -313,6 +337,7 @@ class EvaluationGenerator:
             raise ValueError("No saved completions found. Run 'run_completions' first.")
 
         metrics_results = self._run_metric_calculators(responses)
+        del responses
         aggregated_results = self._aggregate_results(metrics_results)
         results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
         aggregated_results.update(results_with_aggregators)

src/eval_framework/response_generator.py
@@ -78,7 +78,7 @@ class ResponseGenerator:
             custom_hf_revision=self.config.hf_revision,
         )
 
-        self.response_type, _ = self.task._get_type_and_metrics()
+        self.response_type = self.task.get_response_type()
 
     def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
         """
@@ -114,8 +114,8 @@ class ResponseGenerator:
         try:
             raw_loglikelihoods = self.llm.logprobs(samples)
         except Exception as e:
-            if raise_errors():
-                raise e
+            if raise_errors() or self.config.fail_on_error:
+                raise
             logger.info(f"Error: {e.__class__.__name__} {e}")
             raw_loglikelihoods = [
                 RawLoglikelihood(
@@ -166,7 +166,8 @@ class ResponseGenerator:
                     self.llm,
                     stop_sequences=stop_sequences,
                     max_tokens=max_tokens,
-                )  # type: ignore[call-arg]
+                    fail_on_error=self.config.fail_on_error,
+                )
             case ResponseType.LOGLIKELIHOODS:
                 return self._generate_loglikelihoods
             case _:

src/eval_framework/result_processors/result_processor.py
@@ -36,9 +36,9 @@ class ResultsFileProcessor(ResultProcessor):
         return {}
 
     def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
-        responses_data = [response.model_dump(mode="json", serialize_as_any=True) for response in responses]
         with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
-            f.write_all(responses_data)
+            for response in responses:
+                f.write(response.model_dump(mode="json", serialize_as_any=True))
 
     def save_response(self, response: Completion | Loglikelihood) -> None:
         with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
@@ -72,9 +72,9 @@ class ResultsFileProcessor(ResultProcessor):
         return responses
 
     def save_metrics_results(self, results: list[Result]) -> None:
-        result_data = [x.model_dump(mode="json") for x in results]
         with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
-            f.write_all(result_data)
+            for result in results:
+                f.write(result.model_dump(mode="json"))
 
     def save_metrics_result(self, result: Result) -> None:
         with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
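
Both writers now stream records with `f.write(...)` instead of first building a full list of dumped dicts, so peak memory is bounded by one serialized record rather than the whole run. A self-contained sketch of the same jsonlines pattern (file name and records are hypothetical):

import jsonlines

records = ({"id": i, "score": i / 10} for i in range(3))  # any iterable of dicts

with jsonlines.open("output.jsonl", "w") as f:
    for record in records:
        f.write(record)  # one JSON line per record; no intermediate list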

src/eval_framework/tasks/base.py
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
 from pydantic import BaseModel, ConfigDict
 
 from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
-from eval_framework.tasks.utils import raise_errors
+from eval_framework.tasks.utils import classproperty, raise_errors
 from template_formatting.formatter import Message, Role
 
 if TYPE_CHECKING:
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
     DATASET_PATH: str
     SAMPLE_SPLIT: str
     FEWSHOT_SPLIT: str
-    RESPONSE_TYPE: ResponseType
-    METRICS: list[type["BaseMetric"]]
     SUBJECTS: list[SubjectType]
     HF_REVISION: str | None = None  # tag name, or branch name, or commit hash to ensure reproducibility
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
     # language by subtopic, or `None` (for tasks not specific to a single language).
     LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
 
+    # RESPONSE_TYPE and METRICS use exposed as classproperties, so you can access them via either
+    # `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
+    # By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
+
     def __init__(self, num_fewshot: int = 0) -> None:
         self.num_fewshot = num_fewshot
         self.stop_sequences: list[str] | None = None
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
         return None
 
     def get_metadata(self) -> dict[str, str | list[str]]:
-        response_type, metrics = self._get_type_and_metrics()
-
         meta: dict[str, str | list[str]] = {
             "dataset_path": self.DATASET_PATH,
             "sample_split": self.SAMPLE_SPLIT,
             "fewshot_split": self.FEWSHOT_SPLIT,
-            "response_type": response_type.value,
-            "metrics": [m.NAME for m in metrics],
+            "response_type": self.get_response_type().value,
+            "metrics": [m.NAME for m in self.get_metrics()],
             "subjects": [str(s) for s in self.SUBJECTS],
         }
         if hasattr(self, "TASK_STYLER"):
@@ -352,12 +352,15 @@
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
+        fail_on_error: bool = False,
     ) -> list[Completion]:
         """
         Generates completions for the sample.
         :param sample: sample to generate completions for
         :param stop_sequences: stop sequences to use in completion generation
         :param max_tokens: maximum tokens to use in completion generation
+        :param fail_on_error: if True, re-raise the original exception instead of capturing it
+            into a per-sample Error completion
         :return: completion
         """
         if stop_sequences is None:
@@ -367,8 +370,8 @@
         try:
             raw_completions = llm.generate(samples=samples, stop_sequences=stop_sequences, max_tokens=max_tokens)
         except Exception as e:
-            if raise_errors():
-                raise e
+            if raise_errors() or fail_on_error:
+                raise
             logger.info(f"Error: {e.__class__.__name__} {e}")
             raw_completions = [
                 RawCompletion(
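
This hunk (and its twin in response_generator.py) adds the `fail_on_error` escape hatch and switches `raise e` to a bare `raise`, which re-raises the active exception as-is. A minimal sketch of the control flow, with a hypothetical failing call standing in for `llm.generate`:

def call_with_policy(fail_on_error: bool) -> str | None:
    try:
        raise ConnectionError("endpoint unreachable")  # hypothetical failure
    except Exception as e:
        if fail_on_error:
            raise  # propagate the original exception, traceback intact
        print(f"Error: {e.__class__.__name__} {e}")  # logged, run continues
        return None  # stands in for the blank Error completion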
@@ -420,7 +423,26 @@
         )
         return completion_list
 
-    def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]:
-        if hasattr(self, "TASK_STYLER"):
-            return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
-        return self.RESPONSE_TYPE, self.METRICS
+    @classmethod
+    def get_response_type(cls) -> ResponseType:
+        """Return the response type of the task (or the styler if it exists)."""
+        if hasattr(cls, "TASK_STYLER"):
+            return cls.TASK_STYLER.response_type
+        return cls.RESPONSE_TYPE
+
+    @classmethod
+    def get_metrics(cls) -> list[type["BaseMetric"]]:
+        """Return the metrics of the task (or the styler if it exists)."""
+        if hasattr(cls, "TASK_STYLER"):
+            return cls.TASK_STYLER.metrics
+        return cls.METRICS
+
+    @classproperty
+    def RESPONSE_TYPE(cls) -> ResponseType:
+        """For backwards compatibility."""
+        return cls.get_response_type()
+
+    @classproperty
+    def METRICS(cls) -> list[type["BaseMetric"]]:
+        """For backwards compatibility."""
+        return cls.get_metrics()

src/eval_framework/tasks/benchmarks/aidanbench.py
@@ -104,7 +104,12 @@ class AidanBenchOriginal(BaseTask[str]):
         return [Message(role=Role.USER, content=instruction_message)]
 
     def _generation_loop(
-        self, llm: "BaseLLM", stop_sequences: list[str] | None, max_tokens: int | None, initial_samples: list[Sample]
+        self,
+        llm: "BaseLLM",
+        stop_sequences: list[str] | None,
+        max_tokens: int | None,
+        initial_samples: list[Sample],
+        fail_on_error: bool = False,
     ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
         initial_messages = [s.messages for s in initial_samples]
         samples = [(s, False) for s in initial_samples]  # (sample, is_done)
@@ -118,6 +123,7 @@
                 [samples[i][0] for i in not_done_idx],
                 stop_sequences=stop_sequences,
                 max_tokens=max_tokens,
+                fail_on_error=fail_on_error,
             )
             new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
             new_errors = [c.error for c in new_completions]
@@ -164,11 +170,14 @@
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
+        fail_on_error: bool = False,
     ) -> list[Completion]:
         assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
             "Each sample must have exactly one USER message."
         )
-        all_message_histories, errors = self._generation_loop(llm, stop_sequences, max_tokens, samples)
+        all_message_histories, errors = self._generation_loop(
+            llm, stop_sequences, max_tokens, samples, fail_on_error=fail_on_error
+        )
 
         completion_list = []
         for idx, sample in enumerate(samples):

src/eval_framework/tasks/benchmarks/naturalqs_open.py
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
 
 class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
     NAME = "NaturalQsOpenMC"
-    TASK_STYLER = MCStyle(space_prefixed_labels=True)
+    TASK_STYLER = MCStyle()
 
 
 class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):

src/eval_framework/tasks/eval_config.py
@@ -27,6 +27,7 @@ KEYS_UNRELATED_TO_RESULTS = {
     "save_intermediate_results",
     "save_logs",
     "delete_output_dir_after_upload",
+    "fail_on_error",
 }
 
 
@@ -59,6 +60,9 @@ class EvalConfig(BaseConfig):
     # how many times to repeat a single sample
     # can be used to reduce variance of tasks with low number of samples, e.g. AIME24
     repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
+    # When True, request/sample errors (e.g. unreachable inference endpoint, exhausted retries)
+    # propagate instead of being captured into a blank Error result.
+    fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
     # Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
 
     @property
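
The new field follows the same `BeforeValidator` convention as `repeats` above: a `None` coming from a config file is coerced to the default before boolean validation runs. A self-contained sketch of that coercion:

from typing import Annotated

from pydantic import BaseModel, BeforeValidator

class Cfg(BaseModel):  # hypothetical minimal config
    fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False

assert Cfg().fail_on_error is False
assert Cfg(fail_on_error=None).fail_on_error is False  # None is coerced, not rejected
assert Cfg(fail_on_error=True).fail_on_error is True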
@@ -112,7 +116,7 @@ class EvalConfig(BaseConfig):
     @model_validator(mode="after")
     def validate_llm_judge_defined(self) -> "EvalConfig":
         task = get_task(self.task_name)
-        _, task_metrics = task(num_fewshot=0)._get_type_and_metrics()
+        task_metrics = task(num_fewshot=0).get_metrics()
         for metric_class in task_metrics:
             if issubclass(metric_class, BaseLLMJudgeMetric):
                 assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."

src/eval_framework/tasks/utils.py
@@ -8,7 +8,7 @@ import string
 import threading
 from collections.abc import Callable
 from pathlib import Path
-from typing import Any, Literal, NamedTuple
+from typing import Any, Literal, NamedTuple, overload
 
 import dill
 import numpy as np
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
 RANDOM_SEED = 42  # hacky way to get around circular import
 redis_warning_printed = False
 
+
+class classproperty[T]:
+    """Descriptor supporting property-like access on classes and instances."""
+
+    def __init__(self, fget: Callable[[Any], T]) -> None:
+        self.fget = fget
+
+    @overload
+    def __get__(self, obj: None, owner: type[Any]) -> T: ...
+
+    @overload
+    def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
+
+    def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
+        cls = owner if owner is not None else type(obj)
+        return self.fget(cls)
+
+
 _pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
 _pools_lock = threading.Lock()
 
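This descriptor is what lets base.py keep exposing `RESPONSE_TYPE` and `METRICS` without re-declaring class variables: the wrapped function always receives the class, whether the attribute is read off the class or an instance. A short usage sketch (the `Example` class is hypothetical; the import matches the one added to base.py in this release):

from eval_framework.tasks.utils import classproperty

class Example:
    @classproperty
    def LABEL(cls) -> str:
        return cls.__name__.upper()

assert Example.LABEL == "EXAMPLE"    # class access: owner is passed to fget
assert Example().LABEL == "EXAMPLE"  # instance access resolves through the same fget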