eval-framework 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {eval_framework-0.3.6 → eval_framework-0.3.8}/PKG-INFO +28 -28
  2. {eval_framework-0.3.6 → eval_framework-0.3.8}/pyproject.toml +39 -38
  3. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/evaluation_generator.py +1 -0
  4. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/openai.py +86 -22
  5. eval_framework-0.3.8/src/eval_framework/llm/vllm_local_server.py +217 -0
  6. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/base.py +18 -0
  7. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/code_assertion.py +8 -1
  8. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +20 -26
  9. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/multipl_e_assertion.py +7 -1
  10. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/comparison_grader.py +2 -2
  11. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/base.py +13 -4
  12. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/dataset_revisions.py +83 -0
  13. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/ifeval.py +2 -0
  14. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -1
  15. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +179 -0
  16. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/perturbation.py +2 -2
  17. {eval_framework-0.3.6 → eval_framework-0.3.8}/LICENSE +0 -0
  18. {eval_framework-0.3.6 → eval_framework-0.3.8}/README.md +0 -0
  19. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/__init__.py +0 -0
  20. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/base_config.py +0 -0
  21. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/context/__init__.py +0 -0
  22. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/context/determined.py +0 -0
  23. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/context/eval.py +0 -0
  24. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/context/local.py +0 -0
  25. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/exceptions.py +0 -0
  26. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/drop_process_results.py +0 -0
  27. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  28. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  29. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  30. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  31. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  32. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/__init__.py +0 -0
  33. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/aleph_alpha.py +0 -0
  34. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/base.py +0 -0
  35. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/huggingface.py +0 -0
  36. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/mistral.py +0 -0
  37. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/models.py +0 -0
  38. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/llm/vllm.py +0 -0
  39. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/logger.py +0 -0
  40. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/main.py +0 -0
  41. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/__init__.py +0 -0
  42. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  43. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  44. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/__init__.py +0 -0
  45. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  46. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  47. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/bleu.py +0 -0
  48. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/chrf.py +0 -0
  49. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/comet.py +0 -0
  50. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  51. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  52. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  53. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  54. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  55. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/f1.py +0 -0
  56. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  57. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  58. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  59. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/json_format.py +0 -0
  60. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  61. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/length_control.py +0 -0
  62. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  63. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  64. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  65. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  66. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  67. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/repetition.py +0 -0
  68. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  69. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  70. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  71. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  72. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  73. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/ter.py +0 -0
  74. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  75. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  76. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  77. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/__init__.py +0 -0
  78. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/base.py +0 -0
  79. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  80. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  81. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  82. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  83. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  84. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  85. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  86. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  87. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  88. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  89. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  90. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  91. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  92. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  93. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  94. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  95. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  96. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  97. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  98. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  99. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  100. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  101. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  102. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  103. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/utils.py +0 -0
  104. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  105. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  106. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  107. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  108. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  109. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  110. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  111. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  112. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/py.typed +0 -0
  113. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/response_generator.py +0 -0
  114. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/result_processors/__init__.py +0 -0
  115. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/result_processors/base.py +0 -0
  116. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  117. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/result_processors/result_processor.py +0 -0
  118. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  119. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/run.py +0 -0
  120. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/run_direct.py +0 -0
  121. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/shared/types.py +0 -0
  122. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/suite.py +0 -0
  123. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  124. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/__init__.py +0 -0
  125. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  126. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  127. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  128. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  129. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  130. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  131. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  132. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  133. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  134. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  135. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  136. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  137. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  138. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  139. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  140. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  141. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  142. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  143. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  144. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  145. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  146. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  147. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  148. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  149. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  150. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  151. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  152. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  153. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  154. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  155. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  156. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  157. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  158. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  159. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  160. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  161. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  162. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  163. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  164. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  165. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  166. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  167. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  168. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  169. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  170. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  171. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  172. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  173. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  174. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  175. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  176. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  177. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/eval_config.py +0 -0
  178. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/registry.py +0 -0
  179. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/task_loader.py +0 -0
  180. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/task_names.py +0 -0
  181. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/task_style.py +0 -0
  182. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/tasks/utils.py +0 -0
  183. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/constants.py +0 -0
  184. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/file_ops.py +0 -0
  185. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/generate_task_docs.py +0 -0
  186. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/helpers.py +0 -0
  187. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/logging.py +0 -0
  188. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/packaging.py +0 -0
  189. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/eval_framework/utils/tqdm_handler.py +0 -0
  190. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/template_formatting/README.md +0 -0
  191. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/template_formatting/__init__.py +0 -0
  192. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/template_formatting/formatter.py +0 -0
  193. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/template_formatting/mistral_formatter.py +0 -0
  194. {eval_framework-0.3.6 → eval_framework-0.3.8}/src/template_formatting/py.typed +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.6
4
- Summary: Evalulation Framework
3
+ Version: 0.3.8
4
+ Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
7
7
  Version 2.0, January 2004
@@ -211,49 +211,49 @@ Classifier: Programming Language :: Python :: 3.12
211
211
  Classifier: Programming Language :: Python :: 3 :: Only
212
212
  Classifier: Topic :: Software Development :: Libraries
213
213
  Classifier: Typing :: Typed
214
- Requires-Dist: pyyaml>=6.0.1,<7
215
- Requires-Dist: xmltodict>=0.13.0,<0.16
216
- Requires-Dist: pydantic>=2.7,<3
217
- Requires-Dist: datasets>=4.0.0,<5
218
- Requires-Dist: sacrebleu>=2.4.3,<3
214
+ Requires-Dist: pyyaml>=6.0.3,<7
215
+ Requires-Dist: xmltodict>=0.15.1,<0.16
216
+ Requires-Dist: pydantic>=2.13.4,<3
217
+ Requires-Dist: datasets>=4.8.5,<5
218
+ Requires-Dist: sacrebleu>=2.6.0,<3
219
219
  Requires-Dist: pycountry>=24.6.1,<25
220
- Requires-Dist: nltk>=3.9.1,<4
221
- Requires-Dist: python-dotenv>=1.0.1,<2
222
- Requires-Dist: lingua-language-detector>=2.0.2,<3
223
- Requires-Dist: google-crc32c>=1.5.0,<2
220
+ Requires-Dist: nltk>=3.9.4,<4
221
+ Requires-Dist: python-dotenv>=1.2.2,<2
222
+ Requires-Dist: lingua-language-detector>=2.2.0,<3
223
+ Requires-Dist: google-crc32c>=1.8.0,<2
224
224
  Requires-Dist: kubernetes>=31.0.0,<32
225
225
  Requires-Dist: langdetect>=1.0.9,<2
226
- Requires-Dist: spacy>=3.8.3,<4
227
- Requires-Dist: jsonschema>=4.23.0,<5
228
- Requires-Dist: mysql-connector-python>=9.0.0,<10
229
- Requires-Dist: psycopg2-binary>=2.9.9,<3
226
+ Requires-Dist: spacy>=3.8.14,<4
227
+ Requires-Dist: jsonschema>=4.26.0,<5
228
+ Requires-Dist: mysql-connector-python>=9.7.0,<10
229
+ Requires-Dist: psycopg2-binary>=2.9.12,<3
230
230
  Requires-Dist: sympy>=1.13.1,<2
231
- Requires-Dist: llm-sandbox[docker]==0.3.37
231
+ Requires-Dist: llm-sandbox[docker]==0.3.39
232
232
  Requires-Dist: jsonlines>=4,<5
233
- Requires-Dist: lxml>=6,<7
234
- Requires-Dist: python-iso639>=2025.2.18
235
- Requires-Dist: wandb>=0.23.0,<1
236
- Requires-Dist: boto3>=1.40.54,<2
233
+ Requires-Dist: lxml>=6.1.1,<7
234
+ Requires-Dist: python-iso639>=2026.4.20
235
+ Requires-Dist: wandb>=0.27.0,<1
236
+ Requires-Dist: boto3>=1.43.18,<2
237
237
  Requires-Dist: numpy>=1.26.4
238
238
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
- Requires-Dist: scipy>=1.14.0,<2
239
+ Requires-Dist: scipy>=1.17.1,<2
240
240
  Requires-Dist: accelerate ; extra == 'accelerate'
241
241
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
242
242
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
243
- Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
244
- Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
245
- Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
246
- Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
247
- Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
243
+ Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
244
+ Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
245
+ Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
246
+ Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
247
+ Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
248
248
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
249
249
  Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
250
- Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
250
+ Requires-Dist: tiktoken>=0.13.0,<1 ; extra == 'openai'
251
251
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
252
252
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
253
253
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
254
254
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
255
255
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
256
- Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
256
+ Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
257
257
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
258
258
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
259
259
  Requires-Python: >=3.12, <3.13
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.6"
4
- description = "Evalulation Framework"
3
+ version = "0.3.8"
4
+ description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
7
7
  requires-python = ">=3.12,<3.13"
@@ -18,53 +18,53 @@ classifiers = [
18
18
  "Typing :: Typed",
19
19
  ]
20
20
  dependencies = [
21
- "pyyaml>=6.0.1,<7",
22
- "xmltodict>=0.13.0,<0.16",
23
- "pydantic>=2.7,<3",
24
- "datasets>=4.0.0,<5",
25
- "sacrebleu>=2.4.3,<3",
21
+ "pyyaml>=6.0.3,<7",
22
+ "xmltodict>=0.15.1,<0.16",
23
+ "pydantic>=2.13.4,<3",
24
+ "datasets>=4.8.5,<5",
25
+ "sacrebleu>=2.6.0,<3",
26
26
  "pycountry>=24.6.1,<25",
27
- "nltk>=3.9.1,<4",
28
- "python-dotenv>=1.0.1,<2",
29
- "lingua-language-detector>=2.0.2,<3",
30
- "google-crc32c>=1.5.0,<2",
27
+ "nltk>=3.9.4,<4",
28
+ "python-dotenv>=1.2.2,<2",
29
+ "lingua-language-detector>=2.2.0,<3",
30
+ "google-crc32c>=1.8.0,<2",
31
31
  "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
32
32
  "langdetect>=1.0.9,<2", # required by the original ifeval implementation
33
- "spacy>=3.8.3,<4",
34
- "jsonschema>=4.23.0,<5",
35
- "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
36
- "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
33
+ "spacy>=3.8.14,<4",
34
+ "jsonschema>=4.26.0,<5",
35
+ "mysql-connector-python>=9.7.0,<10", # required for sql-related tasks
36
+ "psycopg2-binary>=2.9.12,<3", # required for sql-related tasks
37
37
  "sympy>=1.13.1,<2",
38
- "llm-sandbox[docker]==0.3.37",
38
+ "llm-sandbox[docker]==0.3.39",
39
39
  "jsonlines>=4,<5",
40
- "lxml>=6,<7",
41
- "python-iso639>=2025.2.18",
42
- "wandb>=0.23.0,<1",
43
- "boto3>=1.40.54,<2",
40
+ "lxml>=6.1.1,<7",
41
+ "python-iso639>=2026.4.20",
42
+ "wandb>=0.27.0,<1",
43
+ "boto3>=1.43.18,<2",
44
44
  "numpy>=1.26.4",
45
45
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
46
46
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
47
47
  "antlr4-python3-runtime==4.11.0",
48
- "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
48
+ "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
49
49
 
50
50
  ]
51
51
 
52
52
  [project.optional-dependencies]
53
53
  # Model-specific extras
54
54
  determined = [
55
- "determined>=0.38,<0.39",
56
- "tensorboard==2.19.0"
55
+ "determined>=0.38.1,<0.39",
56
+ "tensorboard==2.20.0"
57
57
  ]
58
58
  api = ["aleph-alpha-client>=11.5.1"]
59
59
  openai = [
60
60
  "openai>=1.62,<3",
61
- "tiktoken>=0.9,<1",
61
+ "tiktoken>=0.13.0,<1",
62
62
  "transformers>=4.45.2,<5",
63
63
  ]
64
64
  transformers = [
65
65
  "transformers>=4.45.2,<5",
66
66
  "torch>=2.5,<3",
67
- "accelerate>=0.30.0,<1",
67
+ "accelerate>=0.34.2,<1",
68
68
  ]
69
69
  accelerate = ["accelerate"]
70
70
  vllm = [
@@ -72,13 +72,13 @@ vllm = [
72
72
  "torch>=2.5,<3"
73
73
  ]
74
74
  mistral = [
75
- "mistral-common>=1.7,<2",
76
- "huggingface-hub>=0.33.2,<0.34",
75
+ "mistral-common>=1.11.2,<2",
76
+ "huggingface-hub>=0.36.2,<0.37",
77
77
  "eval_framework[vllm]",
78
78
  ]
79
79
  # Benchmark/metric specific extras
80
80
  comet = [
81
- "unbabel-comet>=2.2.6,<3",
81
+ "unbabel-comet>=2.2.7,<3",
82
82
  ]
83
83
  # from template-formatting
84
84
  optional = [
@@ -97,24 +97,25 @@ eval_framework = "eval_framework.run:run"
97
97
 
98
98
  [dependency-groups]
99
99
  dev = [
100
- "mypy>=1.10,<2",
101
- "pytest>=8.3.3,<9",
102
- "pytest-mock>=3.14.1",
103
- "pytest-xdist>=3.6.1,<4",
100
+ "mypy>=2.1.0,<3",
101
+ "pytest>=9.0.3,<10",
102
+ "pytest-mock>=3.15.1",
103
+ "pytest-xdist>=3.8.0,<4",
104
104
  "pytest-sugar>1.1,<2",
105
- "types-pyyaml>=6.0.12.20240917,<7",
106
- "types-python-dateutil>=2.9.0.20241206,<3",
107
- "types-requests>=2.32.0.20250328,<3",
105
+ "types-pyyaml>=6.0.12.20260518,<7",
106
+ "types-python-dateutil>=2.9.0.20260518,<3",
107
+ "types-requests>=2.33.0.20260518,<3",
108
108
  "plotly>=5.24.1,<6",
109
- "ruff>=0.12.8",
109
+ "ruff>=0.15.15",
110
+ "pip-licenses>=5.5.5",
110
111
  ]
111
112
  flash-attn = [
112
- "flash-attn>=2.7.2.post1,<2.8",
113
+ "flash-attn>=2.8.3,<2.9",
113
114
  "torch"
114
115
  ]
115
116
 
116
117
  [build-system]
117
- requires = ["uv_build>=0.9.0,<0.10.0"]
118
+ requires = ["uv_build>=0.11.17,<0.11.18"]
118
119
  build-backend = "uv_build"
119
120
 
120
121
  [tool.uv.build-backend]
@@ -80,6 +80,7 @@ class EvaluationGenerator:
80
80
  )
81
81
  else:
82
82
  metric = metric_class()
83
+ metric.fail_on_error = self.config.fail_on_error
83
84
 
84
85
  logger.info(f"Starting calculation of {metric.NAME}")
85
86
  safe_tqdm_write(f"INFO: Calculating {metric.NAME}")
@@ -8,18 +8,38 @@ from functools import partial
8
8
 
9
9
  import tiktoken
10
10
  from openai import OpenAI
11
- from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
11
+ from openai.types.chat import (
12
+ ChatCompletionAssistantMessageParam,
13
+ ChatCompletionMessageParam,
14
+ ChatCompletionSystemMessageParam,
15
+ ChatCompletionUserMessageParam,
16
+ )
12
17
  from tokenizers import Tokenizer
13
18
  from transformers import AutoTokenizer
14
19
 
15
20
  from eval_framework.llm.base import BaseLLM
16
21
  from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
17
22
  from eval_framework.tasks.base import Sample
18
- from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
23
+ from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message, Role
19
24
 
20
25
  logger = logging.getLogger(__name__)
21
26
 
22
27
 
28
+ def _to_chat_completion_message(message: Message) -> ChatCompletionMessageParam:
29
+ match message.role:
30
+ case Role.SYSTEM:
31
+ return ChatCompletionSystemMessageParam(role="system", content=message.content)
32
+ case Role.USER:
33
+ return ChatCompletionUserMessageParam(role="user", content=message.content)
34
+ case Role.ASSISTANT:
35
+ return ChatCompletionAssistantMessageParam(role="assistant", content=message.content)
36
+ case None:
37
+ raise ValueError(
38
+ "Cannot send a Message without a role through the chat completion API; "
39
+ "the legacy roleless format is only supported for fine-tuning."
40
+ )
41
+
42
+
23
43
  class OpenAIModel(BaseLLM):
24
44
  """
25
45
  LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
@@ -71,8 +91,7 @@ class OpenAIModel(BaseLLM):
71
91
  base_url=base_url,
72
92
  )
73
93
 
74
- # Initialize tokenizer for the model
75
- self._encoder = self._get_encoder()
94
+ self._encoder: tiktoken.Encoding | Tokenizer | None = self._get_encoder_or_none()
76
95
 
77
96
  # set bytes_per_token_scalar for non-standard models
78
97
  if bytes_per_token is not None and bytes_per_token <= 0:
@@ -81,9 +100,23 @@ class OpenAIModel(BaseLLM):
81
100
  4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
82
101
  )
83
102
 
84
- def _get_encoder(self) -> tiktoken.Encoding:
103
+ def _get_encoder_or_none(self) -> tiktoken.Encoding | None:
85
104
  assert self._model_name is not None
86
- return tiktoken.encoding_for_model(self._model_name)
105
+ try:
106
+ return tiktoken.encoding_for_model(self._model_name)
107
+ except KeyError:
108
+ logger.info(
109
+ "tiktoken could not map model_name=%r. Disabling token counting for this model.",
110
+ self._model_name,
111
+ )
112
+ return None
113
+ except Exception as e:
114
+ logger.warning(
115
+ "Failed to initialize tiktoken encoder for model_name=%r (%s). Disabling token counting.",
116
+ self._model_name,
117
+ e.__class__.__name__,
118
+ )
119
+ return None
87
120
 
88
121
  def _count_tokens(self, text: str) -> int:
89
122
  """
@@ -95,6 +128,8 @@ class OpenAIModel(BaseLLM):
95
128
  Returns:
96
129
  Number of tokens.
97
130
  """
131
+ if self._encoder is None:
132
+ raise RuntimeError("Token counting is not available (no encoder configured).")
98
133
  return len(self._encoder.encode(text))
99
134
 
100
135
  def generate_from_messages(
@@ -146,26 +181,36 @@ class OpenAIModel(BaseLLM):
146
181
  stop=stop_sequences,
147
182
  )
148
183
  completion = response.choices[0].text
184
+ usage = getattr(response, "usage", None)
185
+ prompt_tokens = getattr(usage, "prompt_tokens", None) if usage is not None else None
186
+ completion_tokens = getattr(usage, "completion_tokens", None) if usage is not None else None
149
187
  return RawCompletion(
150
188
  prompt=prompt,
151
- prompt_sequence_positions=self._count_tokens(prompt),
152
- concat_compression=ConcatCompression.calculate(
153
- single_messages, count_tokens=self._count_tokens, completion=completion
189
+ prompt_sequence_positions=(
190
+ prompt_tokens
191
+ if prompt_tokens is not None
192
+ else (self._count_tokens(prompt) if self._encoder is not None else None)
193
+ ),
194
+ concat_compression=(
195
+ ConcatCompression.calculate(
196
+ single_messages,
197
+ count_tokens=self._count_tokens,
198
+ completion=completion,
199
+ )
200
+ if self._encoder is not None
201
+ else None
154
202
  ),
155
203
  completion=completion,
156
- completion_sequence_positions=self._count_tokens(completion),
204
+ completion_sequence_positions=(
205
+ completion_tokens
206
+ if completion_tokens is not None
207
+ else (self._count_tokens(completion) if self._encoder is not None else None)
208
+ ),
157
209
  )
158
210
 
159
211
  else:
160
212
  # Use chat completion API
161
- chat_messages = [
162
- (
163
- ChatCompletionUserMessageParam(role="user", content=m.content)
164
- if m.role is not None and m.role.value.lower() == "user"
165
- else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
166
- )
167
- for m in single_messages
168
- ]
213
+ chat_messages = [_to_chat_completion_message(m) for m in single_messages]
169
214
  assert self._model_name is not None
170
215
  chat_response = self._client.chat.completions.create(
171
216
  model=self._model_name,
@@ -177,15 +222,26 @@ class OpenAIModel(BaseLLM):
177
222
  )
178
223
  prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
179
224
  prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
225
+ completion_tokens = getattr(chat_response.usage, "completion_tokens", None)
180
226
  completion = chat_response.choices[0].message.content or ""
181
227
  return RawCompletion(
182
228
  prompt=prompt,
183
229
  prompt_sequence_positions=prompt_tokens,
184
- concat_compression=ConcatCompression.calculate(
185
- single_messages, count_tokens=self._count_tokens, completion=completion
230
+ concat_compression=(
231
+ ConcatCompression.calculate(
232
+ single_messages,
233
+ count_tokens=self._count_tokens,
234
+ completion=completion,
235
+ )
236
+ if self._encoder is not None
237
+ else None
186
238
  ),
187
239
  completion=completion,
188
- completion_sequence_positions=self._count_tokens(completion),
240
+ completion_sequence_positions=(
241
+ completion_tokens
242
+ if completion_tokens is not None
243
+ else (self._count_tokens(completion) if self._encoder is not None else None)
244
+ ),
189
245
  )
190
246
 
191
247
  with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -205,6 +261,10 @@ class OpenAIModel(BaseLLM):
205
261
  Note:
206
262
  Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
207
263
  """
264
+ if self._encoder is None:
265
+ raise NotImplementedError(
266
+ "OpenAIModel.logprobs() requires a local tokenizer/encoder, but none is available."
267
+ )
208
268
  assert self._model_name in ["babbage-002", "davinci-002"], (
209
269
  "Log-probs for prompt tokens are only supported for a limited set of models."
210
270
  )
@@ -370,12 +430,16 @@ class DeepseekModel(OpenAIModel):
370
430
  base_url="https://api.deepseek.com/beta",
371
431
  )
372
432
  self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
433
+ # DeepSeek uses HF tokenization; override the base encoder (which may be None).
434
+ self._encoder = self._get_encoder()
373
435
 
374
436
  def _get_encoder(self) -> Tokenizer:
375
437
  return AutoTokenizer.from_pretrained(self._tokenizer_name)
376
438
 
377
439
  def _count_tokens(self, text: str) -> int:
378
- return len(self._encoder.encode(text))
440
+ encoder = self._encoder
441
+ assert encoder is not None
442
+ return len(encoder.encode(text)) # type: ignore[union-attr]
379
443
 
380
444
 
381
445
  ### Model Aliases ###
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import logging
5
+ import os
6
+ import signal
7
+ import socket
8
+ import subprocess
9
+ import time
10
+ import urllib.error
11
+ import urllib.request
12
+ from collections.abc import Sequence
13
+
14
+ from eval_framework.llm.base import BaseLLM
15
+ from eval_framework.llm.openai import OpenAIModel
16
+ from eval_framework.shared.types import RawCompletion, RawLoglikelihood
17
+ from eval_framework.tasks.base import Sample
18
+ from template_formatting.formatter import BaseFormatter, Message
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def _pick_free_port(host: str) -> int:
    """Return a TCP port number that was free on ``host`` at the time of the call.

    A throwaway IPv4 stream socket is bound to port 0 and the port it ended up on is read back.
    The socket is closed before returning, so the port is only reserved for the duration of the
    probe itself.

    Args:
        host: Address to bind the probe socket to.

    Returns:
        The port number the probe socket was bound to.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind((host, 0))
        bound_address = probe.getsockname()
        return int(bound_address[1])
    finally:
        probe.close()
27
+
28
+
29
def _wait_for_http_ready(url: str, *, timeout_s: float) -> None:
    """Poll ``url`` until the server answers with a status below 500, or fail after ``timeout_s``.

    Any response with a status in ``[200, 500)`` counts as "ready": a 4xx (e.g. 401/404) still
    proves that the HTTP server is up and routing requests.

    Args:
        url: URL to probe with a GET request.
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no qualifying response arrived before the deadline. The message
            includes the last error that was observed, if any.
    """
    # Monotonic clock: the deadline must not move when the wall clock is adjusted.
    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if 200 <= resp.status < 500:
                    return
        except urllib.error.HTTPError as e:
            # `urlopen` raises HTTPError for 4xx/5xx instead of returning the response, so the
            # "< 500" readiness rule has to be applied here as well. This clause must precede
            # the URLError one because HTTPError is a subclass of URLError.
            e.close()
            if e.code < 500:
                return
            last_err = e
        except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
            last_err = e
        time.sleep(0.25)
    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
41
+
42
+
43
def _wait_for_http_ready_or_proc_exit(url: str, *, timeout_s: float, proc: subprocess.Popen[str]) -> None:
    """
    Like `_wait_for_http_ready`, but fail fast if the server process exits.

    This avoids long timeouts that hide the real root cause (e.g. invalid CLI flags,
    missing dependencies, CUDA issues).

    Any HTTP response with a status in ``[200, 500)`` counts as "ready": a 4xx still proves
    that the HTTP server is up.

    Args:
        url: URL to probe with a GET request.
        timeout_s: Maximum number of seconds to keep polling.
        proc: Server process to watch; its exit aborts the wait.

    Raises:
        RuntimeError: If ``proc`` exited before the server became ready (the message carries
            the exit code and up to the last 8000 characters of ``proc.stdout``, when that
            stream is available), or if the deadline passed without a qualifying response.
    """
    # Monotonic clock: the deadline must not move when the wall clock is adjusted.
    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None

    while time.monotonic() < deadline:
        if proc.poll() is not None:
            out = ""
            try:
                if proc.stdout is not None:
                    out = proc.stdout.read() or ""
            except (OSError, ValueError):
                # Best effort: a closed/broken stream or undecodable output must not mask the exit error.
                out = ""
            tail = out.strip()[-8000:]
            raise RuntimeError(
                f"vLLM server process exited before becoming ready. exit_code={proc.returncode}. Output (tail):\n{tail}"
            )

        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if 200 <= resp.status < 500:
                    return
        except urllib.error.HTTPError as e:
            # `urlopen` raises HTTPError for 4xx/5xx instead of returning the response, so the
            # "< 500" readiness rule has to be applied here as well. This clause must precede
            # the URLError one because HTTPError is a subclass of URLError.
            e.close()
            if e.code < 500:
                return
            last_err = e
        except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
            last_err = e
        time.sleep(0.25)

    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
77
+
78
+
79
class VLLMLocalServerModel(BaseLLM):
    """
    Provider-style model: start a local vLLM OpenAI-compatible server, then talk to it via `OpenAIModel(base_url=...)`.

    This gives you a stable HTTP boundary (good for VCR cassettes) while keeping "local vLLM" as a selectable backend.

    Notes:
    - The server is started in a subprocess using `vllm serve`.
    - Server stdout/stderr are written to an anonymous temporary file rather than a pipe: nothing in this
      class drains a pipe, so a chatty server could otherwise block once the OS pipe buffer is full.
    - Cleanup is best-effort (terminate, then kill) and also runs when startup fails.
    - Not all OpenAI API features are guaranteed to be supported by the local server (e.g. logprobs).
    """

    # Upper bound (in characters) on the server output quoted in startup-failure errors.
    _LOG_TAIL_CHARS = 8000

    def __init__(
        self,
        *,
        model_name: str,
        host: str = "127.0.0.1",
        port: int | None = None,
        startup_timeout_s: float = 120.0,
        # `OpenAIModel` parameters:
        formatter: BaseFormatter | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        api_key: str | None = None,
        bytes_per_token: float | None = None,
        # vLLM "serve" parameters (subset, passed through):
        tensor_parallel_size: int | None = None,
        dtype: str | None = None,
        max_model_len: int | None = None,
        gpu_memory_utilization: float | None = None,
        enforce_eager: bool | None = None,
        # Escape hatch:
        vllm_command: str | None = None,
        vllm_extra_args: list[str] | None = None,
        env: dict[str, str] | None = None,
    ) -> None:
        """Launch ``vllm serve``, wait for its ``/v1/models`` endpoint, then build the `OpenAIModel` client.

        Args:
            model_name: Model passed to ``vllm serve`` and to the `OpenAIModel` client.
            host: Address the server is told to bind to; also used to build the client base URL.
            port: Port for the server; when ``None`` a free port is picked via `_pick_free_port`.
            startup_timeout_s: Seconds to wait for the server to answer before giving up.
            formatter: Forwarded to `OpenAIModel`.
            temperature: Forwarded to `OpenAIModel`.
            top_p: Forwarded to `OpenAIModel`.
            api_key: API key for the client; falls back to ``OPENAI_API_KEY`` and then ``"local-vllm"``.
            bytes_per_token: Forwarded to `OpenAIModel`.
            tensor_parallel_size: Passed as ``--tensor-parallel-size`` when not ``None``.
            dtype: Passed as ``--dtype`` when not ``None``.
            max_model_len: Passed as ``--max-model-len`` when not ``None``.
            gpu_memory_utilization: Passed as ``--gpu-memory-utilization`` when not ``None``.
            enforce_eager: Adds the ``--enforce-eager`` flag when truthy.
            vllm_command: Executable to run instead of ``vllm``.
            vllm_extra_args: Extra CLI arguments appended verbatim to the command.
            env: Environment variables layered on top of the current process environment.

        Raises:
            RuntimeError: If the server exits or does not become ready in time. The server is
                stopped first and the tail of its output is appended to the message when available.
        """
        # Function-scope imports keep this class self-contained without touching the module import block.
        import tempfile
        from typing import IO

        # Set before anything can fail so that `_cleanup` / `__del__` are always safe to call.
        self._proc: subprocess.Popen[str] | None = None
        self._log_file: IO[bytes] | None = None

        self._model_name = model_name
        self._host = host
        self._port = port if port is not None else _pick_free_port(host)
        self._startup_timeout_s = float(startup_timeout_s)

        self._server_url = f"http://{self._host}:{self._port}/v1"

        cmd = [vllm_command or "vllm", "serve", self._model_name, "--host", self._host, "--port", str(self._port)]

        # A small, intentionally conservative subset of flags.
        valued_flags = (
            ("--tensor-parallel-size", tensor_parallel_size),
            ("--dtype", dtype),
            ("--max-model-len", max_model_len),
            ("--gpu-memory-utilization", gpu_memory_utilization),
        )
        for flag, value in valued_flags:
            if value is not None:
                cmd += [flag, str(value)]
        if enforce_eager:
            # vLLM exposes this as a boolean flag; passing a value breaks CLI parsing.
            cmd.append("--enforce-eager")

        if vllm_extra_args:
            cmd += list(vllm_extra_args)

        merged_env = os.environ.copy()
        if env:
            merged_env.update(env)

        logger.info("Starting local vLLM server: %s", " ".join(cmd))
        log_file = tempfile.TemporaryFile()
        try:
            proc = subprocess.Popen(
                cmd,
                env=merged_env,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                text=True,
            )
        except BaseException:
            # E.g. the `vllm` executable is missing: do not leak the log file handle.
            log_file.close()
            raise
        self._log_file = log_file
        self._proc = proc

        # Ensure we don't leave it around if the process exits abruptly.
        # NOTE: atexit keeps a reference to this instance until interpreter exit.
        atexit.register(self._cleanup)

        # Wait until the OpenAI-compatible endpoints respond.
        try:
            _wait_for_http_ready_or_proc_exit(
                f"{self._server_url}/models",
                timeout_s=self._startup_timeout_s,
                proc=proc,
            )
        except RuntimeError as err:
            # Stop the server now instead of leaving it running until interpreter exit.
            log_tail = self._shutdown(collect_log=True)
            if not log_tail:
                raise
            raise RuntimeError(f"{err}\nvLLM server output (tail):\n{log_tail}") from err
        except BaseException:
            self._cleanup()
            raise

        # Configure client to talk to the local server.
        # For local servers, any non-empty API key typically works; allow explicit override.
        effective_api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY") or "local-vllm"

        try:
            self._client = OpenAIModel(
                model_name=self._model_name,
                formatter=formatter,
                temperature=temperature,
                top_p=top_p,
                api_key=effective_api_key,
                base_url=self._server_url,
                bytes_per_token=bytes_per_token,
            )
        except BaseException:
            # Without a client the server is useless; do not leave it running.
            self._cleanup()
            raise

    @property
    def name(self) -> str:
        """Identifier of this backend: the model name prefixed with ``vllm_local::``."""
        return f"vllm_local::{self._model_name}"

    def generate_from_messages(
        self,
        messages: list[Sequence[Message]],
        stop_sequences: list[str] | None = None,
        max_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
    ) -> list[RawCompletion]:
        """Delegate generation to the wrapped `OpenAIModel` client, forwarding all arguments unchanged."""
        return self._client.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)

    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
        """Delegate log-probability evaluation to the wrapped `OpenAIModel` client."""
        return self._client.logprobs(samples)

    def _shutdown(self, *, collect_log: bool) -> str:
        """Stop the server process (if running), release the log file, and optionally return the log tail.

        Safe to call repeatedly and on partially constructed instances.

        Args:
            collect_log: When true, return up to `_LOG_TAIL_CHARS` characters from the end of the
                captured server output; otherwise return an empty string.
        """
        proc = getattr(self, "_proc", None)
        log_file = getattr(self, "_log_file", None)
        self._proc = None
        self._log_file = None

        if proc is not None and proc.poll() is None:
            try:
                proc.terminate()
                proc.wait(timeout=10)
            except Exception:
                try:
                    # `kill()` is the portable spelling; `signal.SIGKILL` is not defined on every platform.
                    proc.kill()
                    # Reap the process so it does not linger as a zombie.
                    proc.wait(timeout=5)
                except Exception:
                    logger.warning("Could not stop local vLLM server (pid=%s).", proc.pid, exc_info=True)

        tail = ""
        if log_file is not None:
            try:
                if collect_log:
                    # Read only after the server was stopped: the child shares this file's offset.
                    log_file.seek(0, os.SEEK_END)
                    size = log_file.tell()
                    # A UTF-8 character occupies at most 4 bytes.
                    log_file.seek(max(0, size - 4 * self._LOG_TAIL_CHARS))
                    decoded = log_file.read().decode("utf-8", errors="replace")
                    tail = decoded.strip()[-self._LOG_TAIL_CHARS :]
            except (OSError, ValueError):
                # Best effort: diagnostics must never mask the original failure.
                tail = ""
            finally:
                log_file.close()
        return tail

    def _cleanup(self) -> None:
        """Best-effort shutdown of the server process; a no-op when nothing is running."""
        self._shutdown(collect_log=False)

    def __del__(self) -> None:
        self._cleanup()