eval-framework 0.3.1__tar.gz → 0.3.2__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.1 → eval_framework-0.3.2}/PKG-INFO +1 -1
  2. {eval_framework-0.3.1 → eval_framework-0.3.2}/pyproject.toml +1 -1
  3. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mbpp.py +2 -1
  4. {eval_framework-0.3.1 → eval_framework-0.3.2}/LICENSE +0 -0
  5. {eval_framework-0.3.1 → eval_framework-0.3.2}/README.md +0 -0
  6. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/__init__.py +0 -0
  7. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/base_config.py +0 -0
  8. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/context/__init__.py +0 -0
  9. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/context/determined.py +0 -0
  10. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/context/eval.py +0 -0
  11. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/context/local.py +0 -0
  12. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/evaluation_generator.py +0 -0
  13. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/exceptions.py +0 -0
  14. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/drop_process_results.py +0 -0
  15. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  16. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  17. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  18. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  19. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  20. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/__init__.py +0 -0
  21. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/aleph_alpha.py +0 -0
  22. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/base.py +0 -0
  23. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/huggingface.py +0 -0
  24. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/mistral.py +0 -0
  25. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/models.py +0 -0
  26. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/openai.py +0 -0
  27. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/llm/vllm.py +0 -0
  28. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/logger.py +0 -0
  29. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/main.py +0 -0
  30. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/__init__.py +0 -0
  31. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  32. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  33. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/base.py +0 -0
  34. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/__init__.py +0 -0
  35. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  36. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  37. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/bleu.py +0 -0
  38. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/chrf.py +0 -0
  39. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  40. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  41. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/comet.py +0 -0
  42. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  43. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  44. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  45. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  46. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  47. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/f1.py +0 -0
  48. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  49. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  50. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  51. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/json_format.py +0 -0
  52. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  53. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/length_control.py +0 -0
  54. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  55. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  56. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  57. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  58. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/base.py +0 -0
  72. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  73. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  74. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  75. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  76. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  77. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  78. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  79. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  80. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  81. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  82. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  83. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  84. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  85. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  86. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  87. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  88. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  89. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  90. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  91. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  92. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  93. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  94. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  95. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  96. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  97. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/utils.py +0 -0
  98. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  99. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  100. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  101. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  102. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  103. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  104. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  105. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  106. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/py.typed +0 -0
  107. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/response_generator.py +0 -0
  108. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/result_processors/__init__.py +0 -0
  109. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/result_processors/base.py +0 -0
  110. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  111. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/result_processors/result_processor.py +0 -0
  112. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  113. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/run.py +0 -0
  114. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/suite.py +0 -0
  117. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  118. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/base.py +0 -0
  120. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  121. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  122. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  144. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  145. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  146. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  147. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  148. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  149. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  150. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  151. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  152. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  153. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  154. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  155. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  156. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  157. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  158. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  159. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  160. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  161. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  162. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  163. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  164. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  165. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  166. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  167. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  168. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  169. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  170. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  171. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  172. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  173. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/eval_config.py +0 -0
  174. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.1 → eval_framework-0.3.2}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.1"
3
+ version = "0.3.2"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -276,7 +276,8 @@ class MBPP_OLMES(MBPP):
276
276
 
277
277
  def __init__(self, num_fewshot: int = 3) -> None:
278
278
  super().__init__(num_fewshot)
279
- assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples"
279
+ if num_fewshot != 3:
280
+ logger.warning(f"MBPP_OLMES supports only 3-shot, got {num_fewshot}")
280
281
  self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]
281
282
 
282
283
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
File without changes
File without changes