eval-framework 0.3.4__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.4 → eval_framework-0.3.5}/PKG-INFO +1 -1
  2. {eval_framework-0.3.4 → eval_framework-0.3.5}/pyproject.toml +1 -1
  3. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/evaluation_generator.py +27 -2
  4. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/response_generator.py +4 -3
  5. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/result_processor.py +4 -4
  6. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/base.py +5 -2
  7. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py +11 -2
  8. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/eval_config.py +4 -0
  9. {eval_framework-0.3.4 → eval_framework-0.3.5}/LICENSE +0 -0
  10. {eval_framework-0.3.4 → eval_framework-0.3.5}/README.md +0 -0
  11. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/__init__.py +0 -0
  12. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/base_config.py +0 -0
  13. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/__init__.py +0 -0
  14. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/determined.py +0 -0
  15. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/eval.py +0 -0
  16. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/local.py +0 -0
  17. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/exceptions.py +0 -0
  18. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py +0 -0
  19. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  20. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  21. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  22. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  23. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  24. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/__init__.py +0 -0
  25. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/aleph_alpha.py +0 -0
  26. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/base.py +0 -0
  27. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/huggingface.py +0 -0
  28. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/mistral.py +0 -0
  29. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/models.py +0 -0
  30. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/openai.py +0 -0
  31. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/vllm.py +0 -0
  32. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/logger.py +0 -0
  33. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/main.py +0 -0
  34. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/__init__.py +0 -0
  35. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  36. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  37. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/base.py +0 -0
  38. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py +0 -0
  39. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  40. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  41. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/bleu.py +0 -0
  42. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/chrf.py +0 -0
  43. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  44. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  45. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py +0 -0
  46. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  47. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  48. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  49. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  50. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  51. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/f1.py +0 -0
  52. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  53. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  54. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  55. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py +0 -0
  56. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  57. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/length_control.py +0 -0
  58. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  59. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  60. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  61. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  62. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  63. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  64. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py +0 -0
  65. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  66. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  67. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  68. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  69. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  70. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ter.py +0 -0
  71. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  72. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  73. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  74. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/__init__.py +0 -0
  75. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/base.py +0 -0
  76. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  77. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  78. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  79. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  80. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  81. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  82. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  83. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  84. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  85. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  86. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  87. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  88. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  89. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  90. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  91. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  92. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  93. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  94. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  95. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  96. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  97. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  98. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  99. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  100. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  101. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/utils.py +0 -0
  102. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  103. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  104. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  105. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  106. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  107. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  108. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  109. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  110. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/py.typed +0 -0
  111. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py +0 -0
  112. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/base.py +0 -0
  113. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  114. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  115. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/run.py +0 -0
  116. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/run_direct.py +0 -0
  117. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/shared/types.py +0 -0
  118. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/suite.py +0 -0
  119. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  120. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/__init__.py +0 -0
  121. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  122. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  144. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  145. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  146. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  147. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  148. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  149. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  150. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  151. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  152. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  153. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  154. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  155. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  156. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  157. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  158. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  159. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  160. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  161. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  162. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  163. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  164. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  165. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  166. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  167. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  168. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  169. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  170. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  171. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  172. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  173. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  174. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.4"
3
+ version = "0.3.5"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -127,7 +127,18 @@ class EvaluationGenerator:
127
127
  return results
128
128
 
129
129
  def _aggregate_results(self, results: list[Result]) -> dict[str, float | None]:
130
- data = pd.DataFrame([r.model_dump() for r in results])
130
+ data = pd.DataFrame(
131
+ [
132
+ {
133
+ "metric_name": r.metric_name,
134
+ "subject": r.subject,
135
+ "key": r.key,
136
+ "value": r.value,
137
+ "error": r.error,
138
+ }
139
+ for r in results
140
+ ]
141
+ )
131
142
  if len(data) == 0:
132
143
  return {}
133
144
  data.fillna({"key": ""}, inplace=True)
@@ -251,7 +262,20 @@ class EvaluationGenerator:
251
262
  return aggregated_results
252
263
 
253
264
  def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
254
- data = pd.DataFrame([r.model_dump() for r in results])
265
+ data = pd.DataFrame(
266
+ [
267
+ {
268
+ "metric_name": r.metric_name,
269
+ "metric_class_name": r.metric_class_name,
270
+ "subject": r.subject,
271
+ "key": r.key,
272
+ "value": r.value,
273
+ "error": r.error,
274
+ "prompt": r.prompt,
275
+ }
276
+ for r in results
277
+ ]
278
+ )
255
279
  if len(data) == 0:
256
280
  return {}
257
281
  data = data.fillna({"key": ""})
@@ -313,6 +337,7 @@ class EvaluationGenerator:
313
337
  raise ValueError("No saved completions found. Run 'run_completions' first.")
314
338
 
315
339
  metrics_results = self._run_metric_calculators(responses)
340
+ del responses
316
341
  aggregated_results = self._aggregate_results(metrics_results)
317
342
  results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
318
343
  aggregated_results.update(results_with_aggregators)
@@ -114,8 +114,8 @@ class ResponseGenerator:
114
114
  try:
115
115
  raw_loglikelihoods = self.llm.logprobs(samples)
116
116
  except Exception as e:
117
- if raise_errors():
118
- raise e
117
+ if raise_errors() or self.config.fail_on_error:
118
+ raise
119
119
  logger.info(f"Error: {e.__class__.__name__} {e}")
120
120
  raw_loglikelihoods = [
121
121
  RawLoglikelihood(
@@ -166,7 +166,8 @@ class ResponseGenerator:
166
166
  self.llm,
167
167
  stop_sequences=stop_sequences,
168
168
  max_tokens=max_tokens,
169
- ) # type: ignore[call-arg]
169
+ fail_on_error=self.config.fail_on_error,
170
+ )
170
171
  case ResponseType.LOGLIKELIHOODS:
171
172
  return self._generate_loglikelihoods
172
173
  case _:
@@ -36,9 +36,9 @@ class ResultsFileProcessor(ResultProcessor):
36
36
  return {}
37
37
 
38
38
  def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
39
- responses_data = [response.model_dump(mode="json", serialize_as_any=True) for response in responses]
40
39
  with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
41
- f.write_all(responses_data)
40
+ for response in responses:
41
+ f.write(response.model_dump(mode="json", serialize_as_any=True))
42
42
 
43
43
  def save_response(self, response: Completion | Loglikelihood) -> None:
44
44
  with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
@@ -72,9 +72,9 @@ class ResultsFileProcessor(ResultProcessor):
72
72
  return responses
73
73
 
74
74
  def save_metrics_results(self, results: list[Result]) -> None:
75
- result_data = [x.model_dump(mode="json") for x in results]
76
75
  with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
77
- f.write_all(result_data)
76
+ for result in results:
77
+ f.write(result.model_dump(mode="json"))
78
78
 
79
79
  def save_metrics_result(self, result: Result) -> None:
80
80
  with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
@@ -352,12 +352,15 @@ class BaseTask[SubjectType](ABC):
352
352
  samples: list[Sample],
353
353
  stop_sequences: list[str] | None = None,
354
354
  max_tokens: int | None = None,
355
+ fail_on_error: bool = False,
355
356
  ) -> list[Completion]:
356
357
  """
357
358
  Generates completions for the sample.
358
359
  :param sample: sample to generate completions for
359
360
  :param stop_sequences: stop sequences to use in completion generation
360
361
  :param max_tokens: maximum tokens to use in completion generation
362
+ :param fail_on_error: if True, re-raise the original exception instead of capturing it
363
+ into a per-sample Error completion
361
364
  :return: completion
362
365
  """
363
366
  if stop_sequences is None:
@@ -367,8 +370,8 @@ class BaseTask[SubjectType](ABC):
367
370
  try:
368
371
  raw_completions = llm.generate(samples=samples, stop_sequences=stop_sequences, max_tokens=max_tokens)
369
372
  except Exception as e:
370
- if raise_errors():
371
- raise e
373
+ if raise_errors() or fail_on_error:
374
+ raise
372
375
  logger.info(f"Error: {e.__class__.__name__} {e}")
373
376
  raw_completions = [
374
377
  RawCompletion(
@@ -104,7 +104,12 @@ class AidanBenchOriginal(BaseTask[str]):
104
104
  return [Message(role=Role.USER, content=instruction_message)]
105
105
 
106
106
  def _generation_loop(
107
- self, llm: "BaseLLM", stop_sequences: list[str] | None, max_tokens: int | None, initial_samples: list[Sample]
107
+ self,
108
+ llm: "BaseLLM",
109
+ stop_sequences: list[str] | None,
110
+ max_tokens: int | None,
111
+ initial_samples: list[Sample],
112
+ fail_on_error: bool = False,
108
113
  ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
109
114
  initial_messages = [s.messages for s in initial_samples]
110
115
  samples = [(s, False) for s in initial_samples] # (sample, is_done)
@@ -118,6 +123,7 @@ class AidanBenchOriginal(BaseTask[str]):
118
123
  [samples[i][0] for i in not_done_idx],
119
124
  stop_sequences=stop_sequences,
120
125
  max_tokens=max_tokens,
126
+ fail_on_error=fail_on_error,
121
127
  )
122
128
  new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
123
129
  new_errors = [c.error for c in new_completions]
@@ -164,11 +170,14 @@ class AidanBenchOriginal(BaseTask[str]):
164
170
  samples: list[Sample],
165
171
  stop_sequences: list[str] | None = None,
166
172
  max_tokens: int | None = None,
173
+ fail_on_error: bool = False,
167
174
  ) -> list[Completion]:
168
175
  assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
169
176
  "Each sample must have exactly one USER message."
170
177
  )
171
- all_message_histories, errors = self._generation_loop(llm, stop_sequences, max_tokens, samples)
178
+ all_message_histories, errors = self._generation_loop(
179
+ llm, stop_sequences, max_tokens, samples, fail_on_error=fail_on_error
180
+ )
172
181
 
173
182
  completion_list = []
174
183
  for idx, sample in enumerate(samples):
@@ -27,6 +27,7 @@ KEYS_UNRELATED_TO_RESULTS = {
27
27
  "save_intermediate_results",
28
28
  "save_logs",
29
29
  "delete_output_dir_after_upload",
30
+ "fail_on_error",
30
31
  }
31
32
 
32
33
 
@@ -59,6 +60,9 @@ class EvalConfig(BaseConfig):
59
60
  # how many times to repeat a single sample
60
61
  # can be used to reduce variance of tasks with low number of samples, e.g. AIME24
61
62
  repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
63
+ # When True, request/sample errors (e.g. unreachable inference endpoint, exhausted retries)
64
+ # propagate instead of being captured into a blank Error result.
65
+ fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
62
66
  # Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
63
67
 
64
68
  @property
File without changes
File without changes