eval-framework 0.2.12__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. {eval_framework-0.2.12 → eval_framework-0.2.14}/PKG-INFO +2 -2
  2. {eval_framework-0.2.12 → eval_framework-0.2.14}/README.md +1 -1
  3. {eval_framework-0.2.12 → eval_framework-0.2.14}/pyproject.toml +3 -1
  4. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/evaluation_generator.py +32 -6
  5. eval_framework-0.2.14/src/eval_framework/external/drop_process_results.py +250 -0
  6. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/aleph_alpha.py +7 -2
  7. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/base.py +5 -2
  8. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/huggingface.py +26 -3
  9. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/openai.py +15 -0
  10. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/vllm.py +14 -4
  11. eval_framework-0.2.14/src/eval_framework/metrics/completion/drop_completion.py +47 -0
  12. eval_framework-0.2.14/src/eval_framework/metrics/completion/math_minerva_completion.py +103 -0
  13. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_reasoning_completion.py +2 -8
  14. eval_framework-0.2.14/src/eval_framework/metrics/completion/minerva_math_utils.py +394 -0
  15. eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +64 -0
  16. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/base.py +4 -0
  17. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc.py +28 -2
  18. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/balancedcopa.py +56 -0
  19. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/copa.py +54 -6
  20. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/csqa.py +90 -0
  21. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/drop.py +207 -0
  22. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores200.py +23 -33
  23. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/global_mmlu.py +532 -0
  24. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/goldenswag.py +42 -0
  25. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gpqa.py +27 -0
  26. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag.py +2 -1
  27. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/humaneval.py +44 -0
  28. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/lab_bench.py +89 -0
  29. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/math_reasoning.py +149 -9
  30. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mbpp.py +123 -6
  31. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/medqa.py +83 -0
  32. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu.py +15 -0
  33. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +13 -0
  34. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/naturalqs_open.py +100 -0
  35. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/openbookqa.py +45 -0
  36. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/piqa.py +29 -1
  37. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sciq.py +50 -8
  38. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/social_iqa.py +231 -0
  39. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/squad.py +26 -0
  40. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/truthfulqa.py +31 -0
  41. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogrande.py +29 -1
  42. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -2
  43. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/task_names.py +50 -1
  44. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/generate_task_docs.py +8 -4
  45. {eval_framework-0.2.12 → eval_framework-0.2.14}/LICENSE +0 -0
  46. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/__init__.py +0 -0
  47. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/base_config.py +0 -0
  48. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/__init__.py +0 -0
  49. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/determined.py +0 -0
  50. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/eval.py +0 -0
  51. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/local.py +0 -0
  52. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/exceptions.py +0 -0
  53. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  54. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  55. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  56. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  57. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  58. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/__init__.py +0 -0
  59. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/mistral.py +0 -0
  60. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/models.py +0 -0
  61. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/logger.py +0 -0
  62. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/main.py +0 -0
  63. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/__init__.py +0 -0
  64. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/base.py +0 -0
  65. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/__init__.py +0 -0
  66. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  67. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  68. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/bleu.py +0 -0
  69. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/chrf.py +0 -0
  70. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  71. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  72. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/comet.py +0 -0
  73. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  74. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  75. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  76. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  77. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/f1.py +0 -0
  78. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  79. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  80. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  81. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/json_format.py +0 -0
  82. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  83. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/length_control.py +0 -0
  84. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  85. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  86. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/repetition.py +0 -0
  87. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  88. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  89. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  90. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  91. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  92. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ter.py +0 -0
  93. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  94. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  95. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  96. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/__init__.py +0 -0
  97. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/base.py +0 -0
  98. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  99. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  100. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  101. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  102. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  103. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  104. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  105. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  106. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  107. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  108. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  109. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  110. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  111. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  112. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  113. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  114. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  115. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  116. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  117. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  118. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  119. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  120. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  121. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  122. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  123. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/utils.py +0 -0
  124. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  125. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  126. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  127. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  128. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  129. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  130. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  131. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/py.typed +0 -0
  132. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/response_generator.py +0 -0
  133. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/__init__.py +0 -0
  134. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/base.py +0 -0
  135. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  136. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/result_processor.py +0 -0
  137. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  138. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/run.py +0 -0
  139. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/run_direct.py +0 -0
  140. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/shared/types.py +0 -0
  141. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/__init__.py +0 -0
  142. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  143. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  144. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  145. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  146. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  147. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  148. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  149. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  150. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  151. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  152. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  153. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  154. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  155. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  156. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  157. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  158. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  159. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  160. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  161. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  162. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  163. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  164. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  165. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  166. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  167. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  168. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  169. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/eval_config.py +0 -0
  170. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/perturbation.py +0 -0
  171. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/registry.py +0 -0
  172. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/task_loader.py +0 -0
  173. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/utils.py +0 -0
  174. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/constants.py +0 -0
  175. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/file_ops.py +0 -0
  176. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/helpers.py +0 -0
  177. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/logging.py +0 -0
  178. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/packaging.py +0 -0
  179. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/tqdm_handler.py +0 -0
  180. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/README.md +0 -0
  181. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/__init__.py +0 -0
  182. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/formatter.py +0 -0
  183. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/mistral_formatter.py +0 -0
  184. {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.12
3
+ Version: 0.2.14
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -377,7 +377,7 @@ Subset of core capabilities benchmarks covered by `eval-framework`:
377
377
 
378
378
  | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
379
379
  |---------------|---------------|----------|------------|------------------------|------------------|
380
- | COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
380
+ | COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
381
381
  | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
382
382
  | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
383
383
 
@@ -106,7 +106,7 @@ Subset of core capabilities benchmarks covered by `eval-framework`:
106
106
 
107
107
  | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
108
108
  |---------------|---------------|----------|------------|------------------------|------------------|
109
- | COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
109
+ | COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
110
110
  | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
111
111
  | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
112
112
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.12"
3
+ version = "0.2.14"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -105,6 +105,7 @@ dev = [
105
105
  "types-requests>=2.32.0.20250328,<3",
106
106
  "plotly>=5.24.1,<6",
107
107
  "ruff>=0.12.8",
108
+ "scipy>=1.14.0,<2", # for tests comparing our Hungarian implementation to scipy
108
109
  ]
109
110
  flash-attn = [
110
111
  "flash-attn>=2.7.2.post1,<2.8",
@@ -178,6 +179,7 @@ addopts = "-p 'no:legacypath' --doctest-modules"
178
179
  markers = [
179
180
  "gpu: needs a GPU runner, otherwise test can not be run",
180
181
  "cpu_slow: runs for a long time (on CPU)",
182
+ "slow_download: smoke tests that download large datasets (>15s); excluded from CI, run manually with -m slow_download",
181
183
  "external_api: needs external services for execution",
182
184
  "vllm: tests that specifically require vLLM functionality",
183
185
  "formatter_hash: formatter consistency tests using hash comparisons",
@@ -134,16 +134,27 @@ class EvaluationGenerator:
134
134
  # filter and count errors
135
135
  total_count = len(data_subset)
136
136
 
137
- mask = data["error"].isnull()
137
+ mask = data_subset["error"].isnull()
138
138
  data_subset_error_free = data_subset.loc[mask, ["subject", "key", "value"]]
139
- # data_subset_error_free = data_subset[data_subset["error"].isnull()][["subject", "key", "value"]]
140
139
 
141
- aggregated_results[f"ErrorFreeRatio {metric}"] = float(len(data_subset_error_free) / total_count)
140
+ error_free_ratio = float(len(data_subset_error_free) / total_count)
141
+ aggregated_results[f"ErrorFreeRatio {metric}"] = error_free_ratio
142
142
 
143
143
  # aggregate by key and subject first to have equal weights for all key / subject combinations
144
144
  key_subject_mean = data_subset_error_free.groupby(["key", "subject"]).mean()
145
145
  aggregated_results[f"Average {metric}"] = float(key_subject_mean[["value"]].mean()["value"])
146
146
 
147
+ if error_free_ratio < 1.0:
148
+ # Treat error samples (with value=None) as 0 for the "including errors" average
149
+ data_subset_with_errors = data_subset[["key", "subject", "value", "error"]].copy()
150
+ # Only fill value with 0 where there's an error (not for all None values)
151
+ error_mask = data_subset_with_errors["error"].notna()
152
+ data_subset_with_errors.loc[error_mask, "value"] = data_subset_with_errors.loc[
153
+ error_mask, "value"
154
+ ].fillna(0.0)
155
+ key_subject_mean_with_errors = data_subset_with_errors.groupby(["key", "subject"])["value"].mean()
156
+ aggregated_results[f"Average {metric} (including Errors)"] = float(key_subject_mean_with_errors.mean())
157
+
147
158
  std_err_mean_sum_of_squares = 0.0
148
159
  std_err_mean_total_num_samples = 0.0
149
160
  std_err_mean_num_subjects = 0
@@ -156,14 +167,29 @@ class EvaluationGenerator:
156
167
  # group = data_subset[data[column] == name][["subject", "key", "value", "error"]]
157
168
  group_total_count = len(group)
158
169
  group_error_free = group[group["error"].isnull()][["subject", "key", "value"]]
159
- aggregated_results[f"ErrorFreeRatio {metric} - {name[0]}"] = float(
160
- len(group_error_free) / group_total_count
161
- )
170
+ group_error_free_ratio = float(len(group_error_free) / group_total_count)
171
+ aggregated_results[f"ErrorFreeRatio {metric} - {name[0]}"] = group_error_free_ratio
162
172
 
163
173
  group_key_subject_mean = group_error_free.groupby(["key", "subject"]).mean()
164
174
  value = float(group_key_subject_mean[["value"]].mean()["value"])
165
175
  aggregated_results[f"Average {metric} - {name[0]}"] = value if not math.isnan(value) else None
166
176
 
177
+ if group_error_free_ratio < 1.0:
178
+ # Treat error samples (with value=None) as 0 for the "including errors" average
179
+ group_with_errors = group[["key", "subject", "value", "error"]].copy()
180
+ # Only fill value with 0 where there's an error (not for all None values)
181
+ error_mask = group_with_errors["error"].notna()
182
+ group_with_errors.loc[error_mask, "value"] = group_with_errors.loc[
183
+ error_mask, "value"
184
+ ].fillna(0.0)
185
+ group_key_subject_mean_with_errors = group_with_errors.groupby(["key", "subject"])[
186
+ "value"
187
+ ].mean()
188
+ value_with_errors = float(group_key_subject_mean_with_errors.mean())
189
+ aggregated_results[f"Average {metric} (including Errors) - {name[0]}"] = (
190
+ value_with_errors if not math.isnan(value_with_errors) else None
191
+ )
192
+
167
193
  if not ("SequencePositions" in metric or "Bytes" in metric):
168
194
  # calculate standard error for selected metrics
169
195
  group_key_subject_std = group_error_free.groupby(["key", "subject"]).std()
@@ -0,0 +1,250 @@
1
+ """DROP F1 and exact match evaluation.
2
+
3
+ Logic adapted from AllenNLP DROP evaluation:
4
+ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
5
+ """
6
+
7
+ import re
8
+ import string
9
+
10
+ import numpy as np
11
+
12
+ _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
13
+
14
+
15
+ def _linear_sum_assignment(cost_matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
16
+ """Solve the linear sum assignment problem (minimize cost) using the Hungarian algorithm.
17
+
18
+ Pure NumPy implementation to avoid scipy dependency. Returns (row_ind, col_ind) with row_ind sorted,
19
+ matching scipy.optimize.linear_sum_assignment API for rectangular matrices.
20
+ """
21
+ cost = np.atleast_2d(np.asarray(cost_matrix, dtype=np.float64))
22
+ n_rows, n_cols = cost.shape
23
+ n = max(n_rows, n_cols)
24
+ # Pad to square with zeros so unassigned rows/cols contribute zero cost
25
+ C = np.zeros((n, n))
26
+ C[:n_rows, :n_cols] = cost
27
+
28
+ # Row and column reductions
29
+ u = np.min(C, axis=1)
30
+ u[u == np.inf] = 0
31
+ C = C - u[:, np.newaxis]
32
+ v = np.min(C, axis=0)
33
+ v[v == np.inf] = 0
34
+ C = C - v[np.newaxis, :]
35
+
36
+ # Starred zeros: assignment (1 = starred). Start with no stars.
37
+ star = np.zeros((n, n), dtype=np.intp)
38
+ row_covered = np.zeros(n, dtype=bool)
39
+ col_covered = np.zeros(n, dtype=bool)
40
+ prime = np.zeros((n, n), dtype=np.intp)
41
+
42
+ def find_zero() -> tuple[int, int] | None:
43
+ for i in range(n):
44
+ if row_covered[i]:
45
+ continue
46
+ for j in range(n):
47
+ if not col_covered[j] and C[i, j] == 0:
48
+ return (i, j)
49
+ return None
50
+
51
+ def star_in_row(i: int) -> int | None:
52
+ for j in range(n):
53
+ if star[i, j]:
54
+ return j
55
+ return None
56
+
57
+ def star_in_col(j: int) -> int | None:
58
+ for i in range(n):
59
+ if star[i, j]:
60
+ return i
61
+ return None
62
+
63
+ def prime_in_row(i: int) -> int | None:
64
+ for j in range(n):
65
+ if prime[i, j]:
66
+ return j
67
+ return None
68
+
69
+ while True:
70
+ # Cover columns containing a starred zero
71
+ col_covered[:] = False
72
+ for j in range(n):
73
+ for i in range(n):
74
+ if star[i, j]:
75
+ col_covered[j] = True
76
+ break
77
+ if np.all(col_covered):
78
+ break
79
+ prime[:] = 0
80
+ row_covered[:] = False
81
+ while True:
82
+ z = find_zero()
83
+ if z is None:
84
+ # No uncovered zero: find minimum uncovered value and adjust
85
+ min_val = np.inf
86
+ for i in range(n):
87
+ if not row_covered[i]:
88
+ for j in range(n):
89
+ if not col_covered[j] and C[i, j] < min_val:
90
+ min_val = C[i, j]
91
+ if min_val == np.inf or min_val <= 0:
92
+ min_val = 1e-10
93
+ for i in range(n):
94
+ if row_covered[i]:
95
+ C[i, :] += min_val
96
+ for j in range(n):
97
+ if not col_covered[j]:
98
+ C[:, j] -= min_val
99
+ continue
100
+ i, j = z
101
+ prime[i, j] = 1
102
+ cj = star_in_row(i)
103
+ if cj is None:
104
+ # Augmenting path: unstar starred, star primed along path
105
+ path = [(i, j)]
106
+ while True:
107
+ ji = star_in_col(path[-1][1])
108
+ if ji is None:
109
+ break
110
+ path.append((ji, path[-1][1]))
111
+ pj = prime_in_row(ji)
112
+ if pj is None:
113
+ break
114
+ path.append((ji, pj))
115
+ for pi, pj in path:
116
+ star[pi, pj] = 1 - star[pi, pj]
117
+ prime[:] = 0
118
+ row_covered[:] = False
119
+ col_covered[:] = False
120
+ break
121
+ row_covered[i] = True
122
+ col_covered[cj] = False
123
+
124
+ # Extract assignment: (row_ind, col_ind) for starred zeros, row_ind sorted
125
+ row_ind = np.array([i for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
126
+ col_ind = np.array([j for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
127
+ # Keep only assignments within original matrix
128
+ mask = (row_ind < n_rows) & (col_ind < n_cols)
129
+ row_ind = row_ind[mask]
130
+ col_ind = col_ind[mask]
131
+ # Sort by row index (scipy API)
132
+ perm = np.argsort(row_ind)
133
+ return row_ind[perm], col_ind[perm]
134
+
135
+
136
def process_results(doc: dict, results: list) -> dict[str, float]:
    """Compute DROP exact_match and F1 between predictions and gold answers.

    Args:
        doc: Sample dict; ``doc["answers"]`` is a list of gold answers (each a tuple or list of strings).
        results: List of predicted answers (one per prediction; for one completion use ``[pred]``).

    Returns:
        ``{"exact_match": ..., "f1": ...}`` holding the best score reached over all usable gold
        answers; both are 0.0 when no gold answer is usable.
    """
    max_em = 0.0
    max_f1 = 0.0
    for gold_answer in doc["answers"]:
        # Skip unusable gold answers before scoring: an empty answer, or one whose first span is
        # a blank string, cannot contribute to the maximum.
        if not gold_answer:
            continue
        first_span = gold_answer[0]
        if isinstance(first_span, str) and not first_span.strip():
            continue
        exact_match, f1_score = get_metrics(results, gold_answer)
        max_em = max(max_em, exact_match)
        max_f1 = max(max_f1, f1_score)
    return {"exact_match": max_em, "f1": max_f1}
151
+
152
+
153
def get_metrics(predicted: list | str, gold: tuple | list) -> tuple[float, float]:
    """Score a prediction against one gold answer.

    Both arguments may be a single string or a list/tuple of strings.

    Returns:
        ``(exact_match, f1)``. ``exact_match`` is 1.0 when both sides have the same number of
        spans and the same set of normalized spans, else 0.0. ``f1`` is the mean of the aligned
        per-bag F1 scores, rounded to two decimals.
    """
    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    spans_agree = len(predicted_spans) == len(gold_spans) and set(predicted_spans) == set(gold_spans)
    exact_match = 1.0 if spans_agree else 0.0

    mean_f1 = float(np.mean(_align_bags(predicted_token_bags, gold_token_bags)))
    return exact_match, round(mean_f1, 2)
167
+
168
+
169
def _answer_to_bags(answer: list | tuple | str) -> tuple[list[str], list[set]]:
    """Normalize an answer and split it into token bags.

    A list or tuple is treated as several spans; anything else is treated as a single span.

    Returns:
        ``(normalized_spans, token_bags)`` where ``token_bags[k]`` is the set of
        whitespace-separated tokens of ``normalized_spans[k]``.
    """
    spans = list(answer) if isinstance(answer, list | tuple) else [answer]
    normalized_spans = [_normalize(str(span)) for span in spans]
    token_bags = [set(span_text.split()) for span_text in normalized_spans]
    return normalized_spans, token_bags
181
+
182
+
183
def _align_bags(predicted: list[set], gold: list[set]) -> np.ndarray:
    """Find the best one-to-one pairing of gold and predicted bags and return its F1 scores.

    Returns:
        Array of length ``max(len(gold), len(predicted))``. Entry ``g`` holds the F1 of the
        prediction paired with gold bag ``g``; entries without a pairing stay 0.
    """
    n_gold, n_pred = len(gold), len(predicted)
    scores = np.zeros([n_gold, n_pred])
    for g, gold_bag in enumerate(gold):
        for p, pred_bag in enumerate(predicted):
            # A pair only scores when the gold numbers (if any) are matched by the prediction.
            if not _match_numbers_if_present(gold_bag, pred_bag):
                continue
            scores[g, p] = _compute_f1(pred_bag, gold_bag)

    # The solver minimizes cost, so negate the scores to maximize total F1.
    gold_idx, pred_idx = _linear_sum_assignment(-scores)

    best = np.zeros([max(n_gold, n_pred)])
    for g, p in zip(gold_idx, pred_idx):
        best[g] = max(best[g], scores[g, p])
    return best
195
+
196
+
197
+ def _compute_f1(predicted_bag: set, gold_bag: set) -> float:
198
+ intersection = len(gold_bag.intersection(predicted_bag))
199
+ if not predicted_bag:
200
+ precision = 1.0
201
+ else:
202
+ precision = intersection / float(len(predicted_bag))
203
+ if not gold_bag:
204
+ recall = 1.0
205
+ else:
206
+ recall = intersection / float(len(gold_bag))
207
+ return (2 * precision * recall) / (precision + recall) if (precision or recall) else 0.0
208
+
209
+
210
def _match_numbers_if_present(gold_bag: set, predicted_bag: set) -> bool:
    """Return True when the gold bag has no numeric tokens, or shares at least one with the prediction."""
    gold_numbers = set(filter(_is_number, gold_bag))
    predicted_numbers = set(filter(_is_number, predicted_bag))
    if not gold_numbers:
        return True
    return not gold_numbers.isdisjoint(predicted_numbers)
214
+
215
+
216
+ def _is_number(text: str) -> bool:
217
+ try:
218
+ float(text)
219
+ return True
220
+ except ValueError:
221
+ return False
222
+
223
+
224
def _remove_articles(text: str) -> str:
    """Replace every match of the module-level ``_ARTICLES`` pattern in ``text`` with a single space."""
    without_articles = _ARTICLES.sub(" ", text)
    return without_articles
226
+
227
+
228
+ def _white_space_fix(text: str) -> str:
229
+ return " ".join(text.split())
230
+
231
+
232
def _remove_punc(text: str) -> str:
    """Strip ASCII punctuation from ``text``, unless the whole text parses as a number.

    Numeric text is returned unchanged so that characters such as the decimal point survive.
    """
    if _is_number(text):
        return text
    punctuation = set(string.punctuation)
    kept_chars = [ch for ch in text if ch not in punctuation]
    return "".join(kept_chars)
235
+
236
+
237
+ def _fix_number(text: str) -> str:
238
+ return str(float(text)) if _is_number(text) else text
239
+
240
+
241
+ def _tokenize(text: str) -> list[str]:
242
+ return re.split(" |-", text)
243
+
244
+
245
def _normalize(answer: str) -> str:
    """Normalize an answer string for comparison.

    Each token produced by ``_tokenize`` is lower-cased, stripped of punctuation, number-normalized,
    stripped of articles and whitespace-fixed, in that order. Tokens that end up blank are dropped
    and the remainder is joined with single spaces.
    """
    cleaned_tokens = []
    for raw_token in _tokenize(answer):
        token = _remove_punc(raw_token.lower())
        token = _fix_number(token)
        token = _remove_articles(token)
        token = _white_space_fix(token)
        if token.strip():
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens).strip()
@@ -200,11 +200,16 @@ class AlephAlphaAPIModel(BaseLLM):
200
200
  stop_sequences: list[str] | None = None,
201
201
  max_tokens: int | None = None,
202
202
  temperature: float | None = None,
203
+ top_p: float | None = None,
203
204
  ) -> list[RawCompletion]:
204
205
  effective_temperature = temperature if temperature is not None else self._temperature
206
+ if effective_temperature is not None and not (0 <= effective_temperature <= 2):
207
+ raise ValueError(f"temperature must be between 0 and 2, got {effective_temperature}")
208
+ effective_top_p = top_p if top_p is not None else self._top_p
209
+ if effective_top_p is not None and not (0 < effective_top_p <= 1):
210
+ raise ValueError(f"top_p must be between 0 and 1 (exclusive), got {effective_top_p}")
205
211
 
206
212
  requests: list[CompletionRequest] = []
207
-
208
213
  # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
209
214
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
210
215
 
@@ -215,7 +220,7 @@ class AlephAlphaAPIModel(BaseLLM):
215
220
  maximum_tokens=scaled_max_tokens,
216
221
  stop_sequences=stop_sequences,
217
222
  temperature=effective_temperature,
218
- top_p=self._top_p,
223
+ top_p=effective_top_p,
219
224
  )
220
225
  )
221
226
 
@@ -24,6 +24,7 @@ class BaseLLM(ABC):
24
24
  stop_sequences: list[str] | None = None,
25
25
  max_tokens: int | None = None,
26
26
  temperature: float | None = None,
27
+ top_p: float | None = None,
27
28
  ) -> list[RawCompletion]:
28
29
  """
29
30
  stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
@@ -47,6 +48,7 @@ class BaseLLM(ABC):
47
48
  stop_sequences: list[str] | None = None,
48
49
  max_tokens: int | None = None,
49
50
  temperature: float | None = None,
51
+ top_p: float | None = None,
50
52
  ) -> list[RawCompletion]:
51
53
  """
52
54
  stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
@@ -79,6 +81,7 @@ class BaseLLM(ABC):
79
81
  stop_sequences: list[str] | None = None,
80
82
  max_tokens: int | None = None,
81
83
  temperature: float | None = None,
84
+ top_p: float | None = None,
82
85
  ) -> list[RawCompletion]:
83
86
  """Generates a model response for each sample.
84
87
 
@@ -86,10 +89,10 @@ class BaseLLM(ABC):
86
89
  otherwise falls back to 'generate_from_messages'.
87
90
  """
88
91
  try:
89
- return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
92
+ return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature, top_p)
90
93
  except NotImplementedError:
91
94
  messages: list[Sequence[Message]] = [sample.messages for sample in samples]
92
- return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
95
+ return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
93
96
 
94
97
  def post_process_completion(self, completion: str, sample: Sample) -> str:
95
98
  """
@@ -10,7 +10,14 @@ from typing import Any
10
10
 
11
11
  import torch
12
12
  from tokenizers import Tokenizer
13
- from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
13
+ from transformers import (
14
+ AutoModelForCausalLM,
15
+ AutoTokenizer,
16
+ StoppingCriteria,
17
+ StoppingCriteriaList,
18
+ )
19
+ from transformers.models.gpt2 import GPT2Tokenizer
20
+ from transformers.tokenization_utils import PreTrainedTokenizerBase
14
21
 
15
22
  from eval_framework.llm.base import BaseLLM
16
23
  from eval_framework.shared.types import (
@@ -83,9 +90,13 @@ class BaseHFLLM(BaseLLM):
83
90
  SEQ_LENGTH: int | None = None
84
91
  BYTES_PER_TOKEN: float = 4.0 # rule of thumb according to https://platform.openai.com/tokenizer
85
92
 
93
def _load_tokenizer(self) -> PreTrainedTokenizerBase:
    """Return the tokenizer for ``self.LLM_NAME``, loaded through ``AutoTokenizer``.

    Subclasses can override this hook to load a specific tokenizer class.
    """
    tokenizer = AutoTokenizer.from_pretrained(self.LLM_NAME)
    return tokenizer
96
+
86
97
  def __init__(self, formatter: BaseFormatter | None = None, bytes_per_token: float | None = None) -> None:
87
98
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
- self.tokenizer = AutoTokenizer.from_pretrained(self.LLM_NAME)
99
+ self.tokenizer = self._load_tokenizer()
89
100
  self.model = AutoModelForCausalLM.from_pretrained(self.LLM_NAME, device_map="auto")
90
101
  logger.info(f"{RED}[ Model initialized --------------------- {RESET}{self.LLM_NAME} {RED}]{RESET}")
91
102
  self._set_formatter(formatter)
@@ -135,6 +146,7 @@ class BaseHFLLM(BaseLLM):
135
146
  stop_sequences: list[str] | None = None,
136
147
  max_tokens: int | None = None,
137
148
  temperature: float | None = None,
149
+ top_p: float | None = None,
138
150
  ) -> list[RawCompletion]:
139
151
  if temperature is None:
140
152
  effective_temperature = 0.0 # Current default, TODO: refactor to use model's default
@@ -143,7 +155,8 @@ class BaseHFLLM(BaseLLM):
143
155
  )
144
156
  else:
145
157
  effective_temperature = temperature
146
-
158
+ if top_p is not None:
159
+ logger.warning("Huggingface LLM does not support top_p. Ignoring top_p value.")
147
160
  raw_completions = []
148
161
  for single_messages in messages:
149
162
  # format
@@ -403,14 +416,24 @@ class Pythia410m(HFLLM):
403
416
 
404
417
 
405
418
class SmolLM135M(HFLLM):
    """HF model wrapper for ``HuggingFaceTB/SmolLM-135M``.

    SmolLM-135M uses a GPT2-style tokenizer, and AutoTokenizer can incorrectly
    select LlamaTokenizer for it, so the tokenizer class is pinned here.
    """

    LLM_NAME = "HuggingFaceTB/SmolLM-135M"
    DEFAULT_FORMATTER = ConcatFormatter

    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
        """Load the tokenizer for ``LLM_NAME`` explicitly through ``GPT2Tokenizer``."""
        gpt2_tokenizer = GPT2Tokenizer.from_pretrained(self.LLM_NAME)
        return gpt2_tokenizer
426
+
409
427
 
410
428
class Smollm135MInstruct(HFLLM):
    """HF model wrapper for ``HuggingFaceTB/SmolLM-135M-Instruct``.

    SmolLM-135M-Instruct uses a GPT2-style tokenizer, and AutoTokenizer can
    incorrectly select LlamaTokenizer for it, so the tokenizer class is pinned
    here.
    """

    LLM_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"
    DEFAULT_FORMATTER = partial(HFFormatter, LLM_NAME)

    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
        """Load the tokenizer for ``LLM_NAME`` explicitly through ``GPT2Tokenizer``."""
        gpt2_tokenizer = GPT2Tokenizer.from_pretrained(self.LLM_NAME)
        return gpt2_tokenizer
436
+
414
437
 
415
438
  class Qwen3_0_6B(HFLLM):
416
439
  LLM_NAME = "Qwen/Qwen3-0.6B"
@@ -34,6 +34,7 @@ class OpenAIModel(BaseLLM):
34
34
  model_name: str | None = None,
35
35
  formatter: BaseFormatter | None = None,
36
36
  temperature: float | None = None,
37
+ top_p: float | None = None,
37
38
  api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
38
39
  organization: str | None = None,
39
40
  base_url: str | None = None,
@@ -46,6 +47,7 @@ class OpenAIModel(BaseLLM):
46
47
  model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
47
48
  formatter: Optional message formatter.
48
49
  temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
50
+ top_p: Nucleus sampling probability mass (from 0.0 to 1.0). If None, the API default is used.
49
51
  api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
50
52
  organization: Optional OpenAI organization ID.
51
53
  base_url: Optional API base URL for Azure or alternate endpoints.
@@ -59,6 +61,10 @@ class OpenAIModel(BaseLLM):
59
61
  self._temperature = temperature if temperature is not None else 0.0
60
62
  assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
61
63
 
64
+ if top_p is not None:
65
+ assert 0.0 <= top_p <= 1.0, "top_p must be between 0.0 and 1.0"
66
+ self._top_p = top_p
67
+
62
68
  self._client = OpenAI(
63
69
  api_key=api_key,
64
70
  organization=organization,
@@ -97,6 +103,7 @@ class OpenAIModel(BaseLLM):
97
103
  stop_sequences: list[str] | None = None,
98
104
  max_tokens: int | None = None,
99
105
  temperature: float | None = None,
106
+ top_p: float | None = None,
100
107
  ) -> list[RawCompletion]:
101
108
  """
102
109
  Generate completions for a list of message sequences concurrently.
@@ -108,6 +115,7 @@ class OpenAIModel(BaseLLM):
108
115
  stop_sequences: Optional list of stop sequences.
109
116
  max_tokens: Optional maximum number of tokens to generate.
110
117
  temperature: Sampling temperature.
118
+ top_p: Nucleus sampling probability mass (0.0 to 1.0). Overrides instance default if provided.
111
119
 
112
120
  Returns:
113
121
  List of RawCompletion objects containing prompts and completions.
@@ -116,6 +124,10 @@ class OpenAIModel(BaseLLM):
116
124
  effective_temperature = temperature if temperature is not None else self._temperature
117
125
  assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
118
126
 
127
+ effective_top_p = top_p if top_p is not None else self._top_p
128
+ if effective_top_p is not None:
129
+ assert 0.0 <= effective_top_p <= 1.0, "top_p must be between 0.0 and 1.0"
130
+
119
131
  def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
120
132
  # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
121
133
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
@@ -129,6 +141,7 @@ class OpenAIModel(BaseLLM):
129
141
  model=self._model_name,
130
142
  prompt=prompt,
131
143
  temperature=effective_temperature,
144
+ top_p=effective_top_p,
132
145
  max_tokens=scaled_max_tokens,
133
146
  stop=stop_sequences,
134
147
  )
@@ -158,6 +171,7 @@ class OpenAIModel(BaseLLM):
158
171
  model=self._model_name,
159
172
  messages=chat_messages,
160
173
  temperature=effective_temperature,
174
+ top_p=effective_top_p,
161
175
  max_tokens=scaled_max_tokens,
162
176
  stop=stop_sequences,
163
177
  )
@@ -300,6 +314,7 @@ class OpenAIEmbeddingModel(BaseLLM):
300
314
  stop_sequences: list[str] | None = None,
301
315
  max_tokens: int | None = None,
302
316
  temperature: float | None = None,
317
+ top_p: float | None = None,
303
318
  ) -> list[RawCompletion]:
304
319
  raise NotImplementedError(
305
320
  "Embedding model does not support generate_from_messages. Use generate_embeddings instead."
@@ -134,11 +134,12 @@ class BaseVLLMModel(BaseLLM):
134
134
  **kwargs,
135
135
  }
136
136
 
137
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
138
-
139
137
  self.batch_size = batch_size
140
138
 
141
- self.model = LLM(**model_args, device=device)
139
+ if "VLLM_TARGET_DEVICE" not in os.environ and not torch.cuda.is_available():
140
+ os.environ["VLLM_TARGET_DEVICE"] = "cpu"
141
+
142
+ self.model = LLM(**model_args)
142
143
 
143
144
  self._tokenizer: None | VLLMTokenizerAPI = None
144
145
  _ = self.tokenizer # make sure tokenizer is initialized
@@ -225,6 +226,7 @@ class BaseVLLMModel(BaseLLM):
225
226
  stop_sequences: list[str] | None = None,
226
227
  max_tokens: int | None = None,
227
228
  temperature: float | None = None,
229
+ top_p: float | None = None,
228
230
  ) -> list[RawCompletion]:
229
231
  raw_completions: list[RawCompletion | None] = [None] * len(messages)
230
232
  prompt_objs = []
@@ -234,7 +236,7 @@ class BaseVLLMModel(BaseLLM):
234
236
  scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
235
237
 
236
238
  sampling_params = self._resolve_sampling_params(
237
- self.sampling_params, scaled_max_tokens, stop_sequences, temperature
239
+ self.sampling_params, scaled_max_tokens, stop_sequences, temperature, top_p
238
240
  )
239
241
 
240
242
  for i, single_messages in enumerate(messages):
@@ -294,6 +296,7 @@ class BaseVLLMModel(BaseLLM):
294
296
  max_tokens: int | None,
295
297
  stop_sequences: list[str] | None,
296
298
  temperature: float | None,
299
+ top_p: float | None = None,
297
300
  ) -> SamplingParams:
298
301
  sampling_params.max_tokens = max_tokens
299
302
  sampling_params.stop = stop_sequences
@@ -307,6 +310,13 @@ class BaseVLLMModel(BaseLLM):
307
310
  f"Using sampling params temperature value: {sampling_params.temperature} "
308
311
  f"as no custom temperature value was provided"
309
312
  )
313
+ if top_p is not None:
314
+ logger.warning(f"Overriding sampling params top_p {sampling_params.top_p} with custom value {top_p}")
315
+ sampling_params.top_p = top_p
316
+ else:
317
+ logger.info(
318
+ f"Using sampling params top_p value: {sampling_params.top_p} as no custom top_p value was provided"
319
+ )
310
320
  return sampling_params
311
321
 
312
322
  def _model_generate(