eval-framework 0.3.7__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. {eval_framework-0.3.7 → eval_framework-0.5.0}/PKG-INFO +30 -34
  2. {eval_framework-0.3.7 → eval_framework-0.5.0}/README.md +0 -1
  3. {eval_framework-0.3.7 → eval_framework-0.5.0}/pyproject.toml +43 -63
  4. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/evaluation_generator.py +1 -0
  5. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/openai.py +65 -14
  6. eval_framework-0.5.0/src/eval_framework/llm/vllm_local_server.py +217 -0
  7. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/base.py +18 -0
  8. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_assertion.py +11 -14
  9. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +20 -26
  10. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/multipl_e_assertion.py +7 -1
  11. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/comparison_grader.py +2 -2
  12. eval_framework-0.5.0/src/eval_framework/tasks/__init__.py +12 -0
  13. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/base.py +14 -5
  14. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
  15. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
  16. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
  17. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -1
  18. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/squad.py +21 -0
  19. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -1
  20. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py +27 -1
  21. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
  22. eval_framework-0.5.0/src/eval_framework/tasks/dataset_revisions.py +106 -0
  23. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/perturbation.py +2 -2
  24. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/registry.py +76 -45
  25. eval_framework-0.5.0/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
  26. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_names.py +2 -122
  27. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_style.py +64 -2
  28. eval_framework-0.3.7/src/eval_framework/metrics/completion/comet.py +0 -56
  29. eval_framework-0.3.7/src/eval_framework/tasks/__init__.py +0 -6
  30. {eval_framework-0.3.7 → eval_framework-0.5.0}/LICENSE +0 -0
  31. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/__init__.py +0 -0
  32. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/base_config.py +0 -0
  33. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/__init__.py +0 -0
  34. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/determined.py +0 -0
  35. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/eval.py +0 -0
  36. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/local.py +0 -0
  37. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/exceptions.py +0 -0
  38. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/drop_process_results.py +0 -0
  39. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  40. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  41. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  42. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  43. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  44. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/__init__.py +0 -0
  45. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/aleph_alpha.py +0 -0
  46. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/base.py +0 -0
  47. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/huggingface.py +0 -0
  48. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/mistral.py +0 -0
  49. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/models.py +0 -0
  50. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/vllm.py +0 -0
  51. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/logger.py +0 -0
  52. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/main.py +0 -0
  53. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/__init__.py +0 -0
  54. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  55. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  56. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/__init__.py +0 -0
  57. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  58. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  59. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/bleu.py +0 -0
  60. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/chrf.py +0 -0
  61. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  62. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  63. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  64. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  65. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  66. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/f1.py +0 -0
  67. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  68. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  69. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  70. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/json_format.py +0 -0
  71. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  72. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/length_control.py +0 -0
  73. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  74. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  75. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  76. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  77. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  78. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/repetition.py +0 -0
  79. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  80. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  81. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  82. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  83. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  84. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ter.py +0 -0
  85. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  86. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  87. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  88. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/__init__.py +0 -0
  89. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/base.py +0 -0
  90. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  91. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  92. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  93. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  94. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  95. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  96. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  97. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  98. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  99. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  100. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  101. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  102. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  103. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  104. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  105. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  106. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  107. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  108. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  109. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  110. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  111. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  112. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  113. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  114. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/utils.py +0 -0
  115. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  116. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  117. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  118. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  119. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  120. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  121. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  122. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  123. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/py.typed +0 -0
  124. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/response_generator.py +0 -0
  125. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/__init__.py +0 -0
  126. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/base.py +0 -0
  127. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  128. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/result_processor.py +0 -0
  129. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  130. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/run.py +0 -0
  131. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/run_direct.py +0 -0
  132. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/shared/types.py +0 -0
  133. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/suite.py +0 -0
  134. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  135. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  136. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  137. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  138. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  139. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  140. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  141. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  142. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  143. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  144. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  145. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  146. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  147. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  148. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  149. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  150. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  151. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  152. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  153. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  154. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  155. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  156. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  157. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  158. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  159. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  160. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  161. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  162. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  163. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  164. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  165. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  166. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  167. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  168. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  169. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  170. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  171. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  172. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  173. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  174. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  175. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  176. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  177. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  178. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  179. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  180. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  181. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/eval_config.py +0 -0
  182. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_loader.py +0 -0
  183. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/utils.py +0 -0
  184. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/constants.py +0 -0
  185. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/file_ops.py +0 -0
  186. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/generate_task_docs.py +0 -0
  187. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/helpers.py +0 -0
  188. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/logging.py +0 -0
  189. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/packaging.py +0 -0
  190. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/tqdm_handler.py +0 -0
  191. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/README.md +0 -0
  192. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/__init__.py +0 -0
  193. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/formatter.py +0 -0
  194. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/mistral_formatter.py +0 -0
  195. {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/py.typed +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.7
4
- Summary: Evalulation Framework
3
+ Version: 0.5.0
4
+ Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
7
7
  Version 2.0, January 2004
@@ -211,49 +211,47 @@ Classifier: Programming Language :: Python :: 3.12
211
211
  Classifier: Programming Language :: Python :: 3 :: Only
212
212
  Classifier: Topic :: Software Development :: Libraries
213
213
  Classifier: Typing :: Typed
214
- Requires-Dist: pyyaml>=6.0.1,<7
215
- Requires-Dist: xmltodict>=0.13.0,<0.16
216
- Requires-Dist: pydantic>=2.7,<3
217
- Requires-Dist: datasets>=4.0.0,<5
218
- Requires-Dist: sacrebleu>=2.4.3,<3
219
- Requires-Dist: pycountry>=24.6.1,<25
220
- Requires-Dist: nltk>=3.9.1,<4
221
- Requires-Dist: python-dotenv>=1.0.1,<2
222
- Requires-Dist: lingua-language-detector>=2.0.2,<3
223
- Requires-Dist: google-crc32c>=1.5.0,<2
224
- Requires-Dist: kubernetes>=31.0.0,<32
214
+ Requires-Dist: pyyaml>=6.0.3,<7
215
+ Requires-Dist: xmltodict>=1.0.4,<1.1
216
+ Requires-Dist: pydantic>=2.13.4,<3
217
+ Requires-Dist: datasets>=5.0.0,<6
218
+ Requires-Dist: sacrebleu>=2.6.0,<3
219
+ Requires-Dist: pycountry>=26.2.16,<27
220
+ Requires-Dist: nltk>=3.9.4,<4
221
+ Requires-Dist: python-dotenv>=1.2.2,<2
222
+ Requires-Dist: lingua-language-detector>=2.2.0,<3
223
+ Requires-Dist: google-crc32c>=1.8.0,<2
225
224
  Requires-Dist: langdetect>=1.0.9,<2
226
- Requires-Dist: spacy>=3.8.3,<4
227
- Requires-Dist: jsonschema>=4.23.0,<5
228
- Requires-Dist: mysql-connector-python>=9.0.0,<10
229
- Requires-Dist: psycopg2-binary>=2.9.9,<3
225
+ Requires-Dist: spacy>=3.8.14,<4
226
+ Requires-Dist: jsonschema>=4.26.0,<5
227
+ Requires-Dist: mysql-connector-python>=9.7.0,<10
228
+ Requires-Dist: psycopg2-binary>=2.9.12,<3
230
229
  Requires-Dist: sympy>=1.13.1,<2
231
- Requires-Dist: llm-sandbox[docker]==0.3.37
230
+ Requires-Dist: llm-sandbox[docker]==0.3.39
232
231
  Requires-Dist: jsonlines>=4,<5
233
- Requires-Dist: lxml>=6,<7
234
- Requires-Dist: python-iso639>=2025.2.18
235
- Requires-Dist: wandb>=0.23.0,<1
236
- Requires-Dist: boto3>=1.40.54,<2
237
- Requires-Dist: numpy>=1.26.4
232
+ Requires-Dist: lxml>=6.1.1,<7
233
+ Requires-Dist: python-iso639>=2026.4.20
234
+ Requires-Dist: wandb>=0.27.2,<1
235
+ Requires-Dist: boto3>=1.43.19,<2
236
+ Requires-Dist: numpy>=2.2.6
238
237
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
- Requires-Dist: scipy>=1.14.0,<2
238
+ Requires-Dist: scipy>=1.17.1,<2
240
239
  Requires-Dist: accelerate ; extra == 'accelerate'
241
- Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
240
+ Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
242
241
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
243
- Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
244
- Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
245
- Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
246
- Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
247
- Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
242
+ Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
243
+ Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
244
+ Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
245
+ Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
248
246
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
249
247
  Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
250
- Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
248
+ Requires-Dist: tiktoken>=0.13.0,<1 ; extra == 'openai'
251
249
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
252
250
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
253
251
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
254
252
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
255
253
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
256
- Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
254
+ Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
257
255
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
258
256
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
259
257
  Requires-Python: >=3.12, <3.13
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
261
259
  Provides-Extra: accelerate
262
260
  Provides-Extra: all
263
261
  Provides-Extra: api
264
- Provides-Extra: comet
265
262
  Provides-Extra: determined
266
263
  Provides-Extra: mistral
267
264
  Provides-Extra: openai
@@ -319,7 +316,6 @@ pip install eval_framework
319
316
 
320
317
  There are optional extras available to unlock specific features of the library:
321
318
  - `api` for inference using the aleph-alpha client.
322
- - `comet` for the COMET metric.
323
319
  - `determined` for running jobs via determined.
324
320
  - `mistral` for inference on Mistral models.
325
321
  - `transformers` for inference using the transformers library.
@@ -47,7 +47,6 @@ pip install eval_framework
47
47
 
48
48
  There are optional extras available to unlock specific features of the library:
49
49
  - `api` for inference using the aleph-alpha client.
50
- - `comet` for the COMET metric.
51
50
  - `determined` for running jobs via determined.
52
51
  - `mistral` for inference on Mistral models.
53
52
  - `transformers` for inference using the transformers library.
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.7"
4
- description = "Evalulation Framework"
3
+ version = "0.5.0"
4
+ description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
7
7
  requires-python = ">=3.12,<3.13"
@@ -18,53 +18,51 @@ classifiers = [
18
18
  "Typing :: Typed",
19
19
  ]
20
20
  dependencies = [
21
- "pyyaml>=6.0.1,<7",
22
- "xmltodict>=0.13.0,<0.16",
23
- "pydantic>=2.7,<3",
24
- "datasets>=4.0.0,<5",
25
- "sacrebleu>=2.4.3,<3",
26
- "pycountry>=24.6.1,<25",
27
- "nltk>=3.9.1,<4",
28
- "python-dotenv>=1.0.1,<2",
29
- "lingua-language-detector>=2.0.2,<3",
30
- "google-crc32c>=1.5.0,<2",
31
- "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
21
+ "pyyaml>=6.0.3,<7",
22
+ "xmltodict>=1.0.4,<1.1",
23
+ "pydantic>=2.13.4,<3",
24
+ "datasets>=5.0.0,<6",
25
+ "sacrebleu>=2.6.0,<3",
26
+ "pycountry>=26.2.16,<27",
27
+ "nltk>=3.9.4,<4",
28
+ "python-dotenv>=1.2.2,<2",
29
+ "lingua-language-detector>=2.2.0,<3",
30
+ "google-crc32c>=1.8.0,<2",
32
31
  "langdetect>=1.0.9,<2", # required by the original ifeval implementation
33
- "spacy>=3.8.3,<4",
34
- "jsonschema>=4.23.0,<5",
35
- "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
36
- "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
32
+ "spacy>=3.8.14,<4",
33
+ "jsonschema>=4.26.0,<5",
34
+ "mysql-connector-python>=9.7.0,<10", # required for sql-related tasks
35
+ "psycopg2-binary>=2.9.12,<3", # required for sql-related tasks
37
36
  "sympy>=1.13.1,<2",
38
- "llm-sandbox[docker]==0.3.37",
37
+ "llm-sandbox[docker]==0.3.39",
39
38
  "jsonlines>=4,<5",
40
- "lxml>=6,<7",
41
- "python-iso639>=2025.2.18",
42
- "wandb>=0.23.0,<1",
43
- "boto3>=1.40.54,<2",
44
- "numpy>=1.26.4",
39
+ "lxml>=6.1.1,<7",
40
+ "python-iso639>=2026.4.20",
41
+ "wandb>=0.27.2,<1",
42
+ "boto3>=1.43.19,<2",
43
+ "numpy>=2.2.6",
45
44
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
46
45
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
47
46
  "antlr4-python3-runtime==4.11.0",
48
- "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
49
-
47
+ "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
50
48
  ]
51
49
 
52
50
  [project.optional-dependencies]
53
51
  # Model-specific extras
54
52
  determined = [
55
- "determined>=0.38,<0.39",
56
- "tensorboard==2.19.0"
53
+ "determined>=0.38.1,<0.39",
54
+ "tensorboard==2.20.0"
57
55
  ]
58
56
  api = ["aleph-alpha-client>=11.5.1"]
59
57
  openai = [
60
58
  "openai>=1.62,<3",
61
- "tiktoken>=0.9,<1",
59
+ "tiktoken>=0.13.0,<1",
62
60
  "transformers>=4.45.2,<5",
63
61
  ]
64
62
  transformers = [
65
63
  "transformers>=4.45.2,<5",
66
64
  "torch>=2.5,<3",
67
- "accelerate>=0.30.0,<1",
65
+ "accelerate>=1.14.0,<2",
68
66
  ]
69
67
  accelerate = ["accelerate"]
70
68
  vllm = [
@@ -72,21 +70,17 @@ vllm = [
72
70
  "torch>=2.5,<3"
73
71
  ]
74
72
  mistral = [
75
- "mistral-common>=1.7,<2",
76
- "huggingface-hub>=0.33.2,<0.34",
73
+ "mistral-common>=1.11.3,<2",
74
+ "huggingface-hub>=0.36.2,<0.37",
77
75
  "eval_framework[vllm]",
78
76
  ]
79
- # Benchmark/metric specific extras
80
- comet = [
81
- "unbabel-comet>=2.2.6,<3",
82
- ]
83
77
  # from template-formatting
84
78
  optional = [
85
79
  "transformers>=4.45.2,<5",
86
80
  "jinja2>=3.1.6,<4"
87
81
  ]
88
82
  all = [
89
- "eval_framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral]"
83
+ "eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
90
84
  ]
91
85
 
92
86
  [project.urls]
@@ -97,24 +91,25 @@ eval_framework = "eval_framework.run:run"
97
91
 
98
92
  [dependency-groups]
99
93
  dev = [
100
- "mypy>=1.10,<2",
101
- "pytest>=8.3.3,<9",
102
- "pytest-mock>=3.14.1",
103
- "pytest-xdist>=3.6.1,<4",
94
+ "mypy>=2.1.0,<3",
95
+ "pytest>=9.1.0,<10",
96
+ "pytest-mock>=3.15.1",
97
+ "pytest-xdist>=3.8.0,<4",
104
98
  "pytest-sugar>1.1,<2",
105
- "types-pyyaml>=6.0.12.20240917,<7",
106
- "types-python-dateutil>=2.9.0.20241206,<3",
107
- "types-requests>=2.32.0.20250328,<3",
108
- "plotly>=5.24.1,<6",
109
- "ruff>=0.12.8",
99
+ "types-pyyaml>=6.0.12.20260518,<7",
100
+ "types-python-dateutil>=2.9.0.20260518,<3",
101
+ "types-requests>=2.33.0.20260518,<3",
102
+ "plotly>=6.8.0,<7",
103
+ "ruff>=0.15.18",
104
+ "pip-licenses>=5.5.5",
110
105
  ]
111
106
  flash-attn = [
112
- "flash-attn>=2.7.2.post1,<2.8",
107
+ "flash-attn>=2.8.3,<2.9",
113
108
  "torch"
114
109
  ]
115
110
 
116
111
  [build-system]
117
- requires = ["uv_build>=0.9.0,<0.10.0"]
112
+ requires = ["uv_build>=0.11.22,<0.11.23"]
118
113
  build-backend = "uv_build"
119
114
 
120
115
  [tool.uv.build-backend]
@@ -125,22 +120,6 @@ override-dependencies = [
125
120
  "requests>=2.32,<3", # fix for determined
126
121
  ]
127
122
 
128
- [tool.uv.sources]
129
- torch = [
130
- { index = "pytorch-default", marker = "sys_platform != 'linux'" },
131
- { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
132
- ]
133
-
134
- [[tool.uv.index]]
135
- name = "pytorch-cu124"
136
- url = "https://download.pytorch.org/whl/cu124"
137
- explicit = true
138
-
139
- [[tool.uv.index]]
140
- name = "pytorch-default"
141
- url = "https://pypi.org/simple"
142
- explicit = true
143
-
144
123
  [tool.uv.extra-build-dependencies]
145
124
  # Build flash-attn with the same torch version as in the container. Details at:
146
125
  # https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
@@ -166,6 +145,7 @@ known-third-party = ["wandb"]
166
145
 
167
146
  [tool.ruff.lint.extend-per-file-ignores]
168
147
  "__init__.py" = ["F401"]
148
+ "tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
169
149
 
170
150
  [tool.mypy]
171
151
  plugins = "pydantic.mypy"
@@ -80,6 +80,7 @@ class EvaluationGenerator:
80
80
  )
81
81
  else:
82
82
  metric = metric_class()
83
+ metric.fail_on_error = self.config.fail_on_error
83
84
 
84
85
  logger.info(f"Starting calculation of {metric.NAME}")
85
86
  safe_tqdm_write(f"INFO: Calculating {metric.NAME}")
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
55
55
  formatter: BaseFormatter | None = None,
56
56
  temperature: float | None = None,
57
57
  top_p: float | None = None,
58
- api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
58
+ api_key: str | None = None,
59
59
  organization: str | None = None,
60
60
  base_url: str | None = None,
61
61
  bytes_per_token: float | None = None,
@@ -86,13 +86,12 @@ class OpenAIModel(BaseLLM):
86
86
  self._top_p = top_p
87
87
 
88
88
  self._client = OpenAI(
89
- api_key=api_key,
89
+ api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
90
90
  organization=organization,
91
91
  base_url=base_url,
92
92
  )
93
93
 
94
- # Initialize tokenizer for the model
95
- self._encoder = self._get_encoder()
94
+ self._encoder: tiktoken.Encoding | Tokenizer | None = self._get_encoder_or_none()
96
95
 
97
96
  # set bytes_per_token_scalar for non-standard models
98
97
  if bytes_per_token is not None and bytes_per_token <= 0:
@@ -101,9 +100,23 @@ class OpenAIModel(BaseLLM):
101
100
  4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
102
101
  )
103
102
 
104
- def _get_encoder(self) -> tiktoken.Encoding:
103
+ def _get_encoder_or_none(self) -> tiktoken.Encoding | None:
105
104
  assert self._model_name is not None
106
- return tiktoken.encoding_for_model(self._model_name)
105
+ try:
106
+ return tiktoken.encoding_for_model(self._model_name)
107
+ except KeyError:
108
+ logger.info(
109
+ "tiktoken could not map model_name=%r. Disabling token counting for this model.",
110
+ self._model_name,
111
+ )
112
+ return None
113
+ except Exception as e:
114
+ logger.warning(
115
+ "Failed to initialize tiktoken encoder for model_name=%r (%s). Disabling token counting.",
116
+ self._model_name,
117
+ e.__class__.__name__,
118
+ )
119
+ return None
107
120
 
108
121
  def _count_tokens(self, text: str) -> int:
109
122
  """
@@ -115,6 +128,8 @@ class OpenAIModel(BaseLLM):
115
128
  Returns:
116
129
  Number of tokens.
117
130
  """
131
+ if self._encoder is None:
132
+ raise RuntimeError("Token counting is not available (no encoder configured).")
118
133
  return len(self._encoder.encode(text))
119
134
 
120
135
  def generate_from_messages(
@@ -166,14 +181,31 @@ class OpenAIModel(BaseLLM):
166
181
  stop=stop_sequences,
167
182
  )
168
183
  completion = response.choices[0].text
184
+ usage = getattr(response, "usage", None)
185
+ prompt_tokens = getattr(usage, "prompt_tokens", None) if usage is not None else None
186
+ completion_tokens = getattr(usage, "completion_tokens", None) if usage is not None else None
169
187
  return RawCompletion(
170
188
  prompt=prompt,
171
- prompt_sequence_positions=self._count_tokens(prompt),
172
- concat_compression=ConcatCompression.calculate(
173
- single_messages, count_tokens=self._count_tokens, completion=completion
189
+ prompt_sequence_positions=(
190
+ prompt_tokens
191
+ if prompt_tokens is not None
192
+ else (self._count_tokens(prompt) if self._encoder is not None else None)
193
+ ),
194
+ concat_compression=(
195
+ ConcatCompression.calculate(
196
+ single_messages,
197
+ count_tokens=self._count_tokens,
198
+ completion=completion,
199
+ )
200
+ if self._encoder is not None
201
+ else None
174
202
  ),
175
203
  completion=completion,
176
- completion_sequence_positions=self._count_tokens(completion),
204
+ completion_sequence_positions=(
205
+ completion_tokens
206
+ if completion_tokens is not None
207
+ else (self._count_tokens(completion) if self._encoder is not None else None)
208
+ ),
177
209
  )
178
210
 
179
211
  else:
@@ -190,15 +222,26 @@ class OpenAIModel(BaseLLM):
190
222
  )
191
223
  prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
192
224
  prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
225
+ completion_tokens = getattr(chat_response.usage, "completion_tokens", None)
193
226
  completion = chat_response.choices[0].message.content or ""
194
227
  return RawCompletion(
195
228
  prompt=prompt,
196
229
  prompt_sequence_positions=prompt_tokens,
197
- concat_compression=ConcatCompression.calculate(
198
- single_messages, count_tokens=self._count_tokens, completion=completion
230
+ concat_compression=(
231
+ ConcatCompression.calculate(
232
+ single_messages,
233
+ count_tokens=self._count_tokens,
234
+ completion=completion,
235
+ )
236
+ if self._encoder is not None
237
+ else None
199
238
  ),
200
239
  completion=completion,
201
- completion_sequence_positions=self._count_tokens(completion),
240
+ completion_sequence_positions=(
241
+ completion_tokens
242
+ if completion_tokens is not None
243
+ else (self._count_tokens(completion) if self._encoder is not None else None)
244
+ ),
202
245
  )
203
246
 
204
247
  with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -218,6 +261,10 @@ class OpenAIModel(BaseLLM):
218
261
  Note:
219
262
  Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
220
263
  """
264
+ if self._encoder is None:
265
+ raise NotImplementedError(
266
+ "OpenAIModel.logprobs() requires a local tokenizer/encoder, but none is available."
267
+ )
221
268
  assert self._model_name in ["babbage-002", "davinci-002"], (
222
269
  "Log-probs for prompt tokens are only supported for a limited set of models."
223
270
  )
@@ -383,12 +430,16 @@ class DeepseekModel(OpenAIModel):
383
430
  base_url="https://api.deepseek.com/beta",
384
431
  )
385
432
  self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
433
+ # DeepSeek uses HF tokenization; override the base encoder (which may be None).
434
+ self._encoder = self._get_encoder()
386
435
 
387
436
  def _get_encoder(self) -> Tokenizer:
388
437
  return AutoTokenizer.from_pretrained(self._tokenizer_name)
389
438
 
390
439
  def _count_tokens(self, text: str) -> int:
391
- return len(self._encoder.encode(text))
440
+ encoder = self._encoder
441
+ assert encoder is not None
442
+ return len(encoder.encode(text)) # type: ignore[union-attr]
392
443
 
393
444
 
394
445
  ### Model Aliases ###
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import logging
5
+ import os
6
+ import signal
7
+ import socket
8
+ import subprocess
9
+ import time
10
+ import urllib.error
11
+ import urllib.request
12
+ from collections.abc import Sequence
13
+
14
+ from eval_framework.llm.base import BaseLLM
15
+ from eval_framework.llm.openai import OpenAIModel
16
+ from eval_framework.shared.types import RawCompletion, RawLoglikelihood
17
+ from eval_framework.tasks.base import Sample
18
+ from template_formatting.formatter import BaseFormatter, Message
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ def _pick_free_port(host: str) -> int:
24
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
25
+ s.bind((host, 0))
26
+ return int(s.getsockname()[1])
27
+
28
+
29
+ def _wait_for_http_ready(url: str, *, timeout_s: float) -> None:
30
+ deadline = time.time() + timeout_s
31
+ last_err: Exception | None = None
32
+ while time.time() < deadline:
33
+ try:
34
+ with urllib.request.urlopen(url, timeout=2) as resp:
35
+ if 200 <= resp.status < 500:
36
+ return
37
+ except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
38
+ last_err = e
39
+ time.sleep(0.25)
40
+ raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
41
+
42
+
43
+ def _wait_for_http_ready_or_proc_exit(url: str, *, timeout_s: float, proc: subprocess.Popen[str]) -> None:
44
+ """
45
+ Like `_wait_for_http_ready`, but fail fast if the server process exits.
46
+
47
+ This avoids long timeouts that hide the real root cause (e.g. invalid CLI flags,
48
+ missing dependencies, CUDA issues).
49
+ """
50
+ deadline = time.time() + timeout_s
51
+ last_err: Exception | None = None
52
+
53
+ while time.time() < deadline:
54
+ if proc.poll() is not None:
55
+ out = ""
56
+ try:
57
+ if proc.stdout is not None:
58
+ out = proc.stdout.read() or ""
59
+ except Exception:
60
+ out = ""
61
+ tail = out.strip()
62
+ if len(tail) > 8000:
63
+ tail = tail[-8000:]
64
+ raise RuntimeError(
65
+ f"vLLM server process exited before becoming ready. exit_code={proc.returncode}. Output (tail):\n{tail}"
66
+ )
67
+
68
+ try:
69
+ with urllib.request.urlopen(url, timeout=2) as resp:
70
+ if 200 <= resp.status < 500:
71
+ return
72
+ except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
73
+ last_err = e
74
+ time.sleep(0.25)
75
+
76
+ raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
77
+
78
+
79
+ class VLLMLocalServerModel(BaseLLM):
80
+ """
81
+ Provider-style model: start a local vLLM OpenAI-compatible server, then talk to it via `OpenAIModel(base_url=...)`.
82
+
83
+ This gives you a stable HTTP boundary (good for VCR cassettes) while keeping "local vLLM" as a selectable backend.
84
+
85
+ Notes:
86
+ - The server is started in a subprocess using `vllm serve`.
87
+ - Cleanup is best-effort (SIGTERM then SIGKILL).
88
+ - Not all OpenAI API features are guaranteed to be supported by the local server (e.g. logprobs).
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ *,
94
+ model_name: str,
95
+ host: str = "127.0.0.1",
96
+ port: int | None = None,
97
+ startup_timeout_s: float = 120.0,
98
+ # `OpenAIModel` parameters:
99
+ formatter: BaseFormatter | None = None,
100
+ temperature: float | None = None,
101
+ top_p: float | None = None,
102
+ api_key: str | None = None,
103
+ bytes_per_token: float | None = None,
104
+ # vLLM "serve" parameters (subset, passed through):
105
+ tensor_parallel_size: int | None = None,
106
+ dtype: str | None = None,
107
+ max_model_len: int | None = None,
108
+ gpu_memory_utilization: float | None = None,
109
+ enforce_eager: bool | None = None,
110
+ # Escape hatch:
111
+ vllm_command: str | None = None,
112
+ vllm_extra_args: list[str] | None = None,
113
+ env: dict[str, str] | None = None,
114
+ ) -> None:
115
+ self._model_name = model_name
116
+ self._host = host
117
+ self._port = port if port is not None else _pick_free_port(host)
118
+ self._startup_timeout_s = float(startup_timeout_s)
119
+
120
+ self._proc: subprocess.Popen[str] | None = None
121
+
122
+ self._server_url = f"http://{self._host}:{self._port}/v1"
123
+
124
+ cmd = [vllm_command or "vllm", "serve", self._model_name, "--host", self._host, "--port", str(self._port)]
125
+
126
+ # A small, intentionally conservative subset of flags.
127
+ if tensor_parallel_size is not None:
128
+ cmd += ["--tensor-parallel-size", str(tensor_parallel_size)]
129
+ if dtype is not None:
130
+ cmd += ["--dtype", str(dtype)]
131
+ if max_model_len is not None:
132
+ cmd += ["--max-model-len", str(max_model_len)]
133
+ if gpu_memory_utilization is not None:
134
+ cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)]
135
+ if enforce_eager is not None:
136
+ # vLLM exposes this as a boolean flag; passing a value breaks CLI parsing.
137
+ if enforce_eager:
138
+ cmd += ["--enforce-eager"]
139
+
140
+ if vllm_extra_args:
141
+ cmd += list(vllm_extra_args)
142
+
143
+ merged_env = os.environ.copy()
144
+ if env:
145
+ merged_env.update(env)
146
+
147
+ logger.info("Starting local vLLM server: %s", " ".join(cmd))
148
+ self._proc = subprocess.Popen(
149
+ cmd,
150
+ env=merged_env,
151
+ stdout=subprocess.PIPE,
152
+ stderr=subprocess.STDOUT,
153
+ text=True,
154
+ )
155
+
156
+ # Ensure we don't leave it around if the process exits abruptly.
157
+ atexit.register(self._cleanup)
158
+
159
+ # Wait until the OpenAI-compatible endpoints respond.
160
+ if self._proc is None:
161
+ raise RuntimeError("Failed to start vLLM server process.")
162
+ _wait_for_http_ready_or_proc_exit(
163
+ f"{self._server_url}/models",
164
+ timeout_s=self._startup_timeout_s,
165
+ proc=self._proc,
166
+ )
167
+
168
+ # Configure client to talk to the local server.
169
+ # For local servers, any non-empty API key typically works; allow explicit override.
170
+ effective_api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY") or "local-vllm"
171
+
172
+ self._client = OpenAIModel(
173
+ model_name=self._model_name,
174
+ formatter=formatter,
175
+ temperature=temperature,
176
+ top_p=top_p,
177
+ api_key=effective_api_key,
178
+ base_url=self._server_url,
179
+ bytes_per_token=bytes_per_token,
180
+ )
181
+
182
+ @property
183
+ def name(self) -> str:
184
+ return f"vllm_local::{self._model_name}"
185
+
186
+ def generate_from_messages(
187
+ self,
188
+ messages: list[Sequence[Message]],
189
+ stop_sequences: list[str] | None = None,
190
+ max_tokens: int | None = None,
191
+ temperature: float | None = None,
192
+ top_p: float | None = None,
193
+ ) -> list[RawCompletion]:
194
+ return self._client.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
195
+
196
+ def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
197
+ return self._client.logprobs(samples)
198
+
199
+ def _cleanup(self) -> None:
200
+ proc = self._proc
201
+ self._proc = None
202
+ if proc is None:
203
+ return
204
+ if proc.poll() is not None:
205
+ return
206
+
207
+ try:
208
+ proc.terminate()
209
+ proc.wait(timeout=10)
210
+ except Exception:
211
+ try:
212
+ proc.send_signal(signal.SIGKILL)
213
+ except Exception:
214
+ pass
215
+
216
+ def __del__(self) -> None:
217
+ self._cleanup()