eval-framework 0.3.7__tar.gz → 0.3.8__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {eval_framework-0.3.7 → eval_framework-0.3.8}/PKG-INFO +28 -28
  2. {eval_framework-0.3.7 → eval_framework-0.3.8}/pyproject.toml +39 -38
  3. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/evaluation_generator.py +1 -0
  4. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/openai.py +63 -12
  5. eval_framework-0.3.8/src/eval_framework/llm/vllm_local_server.py +217 -0
  6. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/base.py +18 -0
  7. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/code_assertion.py +8 -1
  8. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +20 -26
  9. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/multipl_e_assertion.py +7 -1
  10. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/comparison_grader.py +2 -2
  11. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/base.py +13 -4
  12. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/dataset_revisions.py +83 -0
  13. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -1
  14. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +179 -0
  15. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/perturbation.py +2 -2
  16. {eval_framework-0.3.7 → eval_framework-0.3.8}/LICENSE +0 -0
  17. {eval_framework-0.3.7 → eval_framework-0.3.8}/README.md +0 -0
  18. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/__init__.py +0 -0
  19. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/base_config.py +0 -0
  20. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/context/__init__.py +0 -0
  21. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/context/determined.py +0 -0
  22. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/context/eval.py +0 -0
  23. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/context/local.py +0 -0
  24. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/exceptions.py +0 -0
  25. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/drop_process_results.py +0 -0
  26. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  27. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  28. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  29. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  30. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  31. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/__init__.py +0 -0
  32. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/aleph_alpha.py +0 -0
  33. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/base.py +0 -0
  34. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/huggingface.py +0 -0
  35. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/mistral.py +0 -0
  36. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/models.py +0 -0
  37. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/vllm.py +0 -0
  38. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/logger.py +0 -0
  39. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/main.py +0 -0
  40. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/__init__.py +0 -0
  41. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  42. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  43. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/__init__.py +0 -0
  44. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  45. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  46. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/bleu.py +0 -0
  47. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/chrf.py +0 -0
  48. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/comet.py +0 -0
  49. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  50. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  51. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  52. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  53. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  54. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/f1.py +0 -0
  55. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  56. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  57. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  58. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/json_format.py +0 -0
  59. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  60. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/length_control.py +0 -0
  61. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  62. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  63. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  64. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  65. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  66. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/repetition.py +0 -0
  67. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  68. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  69. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  70. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  71. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  72. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/ter.py +0 -0
  73. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  74. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  75. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  76. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/__init__.py +0 -0
  77. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/base.py +0 -0
  78. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  79. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  80. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  81. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  82. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  83. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  84. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  85. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  86. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  87. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  88. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  89. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  90. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  91. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  92. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  93. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  94. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  95. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  96. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  97. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  98. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  99. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  100. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  101. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  102. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/llm/utils.py +0 -0
  103. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  104. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  105. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  106. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  107. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  108. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  109. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  110. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  111. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/py.typed +0 -0
  112. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/response_generator.py +0 -0
  113. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/result_processors/__init__.py +0 -0
  114. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/result_processors/base.py +0 -0
  115. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  116. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/result_processors/result_processor.py +0 -0
  117. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  118. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/run.py +0 -0
  119. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/run_direct.py +0 -0
  120. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/shared/types.py +0 -0
  121. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/suite.py +0 -0
  122. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  123. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/__init__.py +0 -0
  124. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  125. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  126. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  127. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  128. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  129. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  130. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  131. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  132. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  133. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  134. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  135. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  136. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  137. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  138. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  139. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  140. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  141. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  142. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  143. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  144. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  145. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  146. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  147. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  148. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  149. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  150. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  151. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  152. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  153. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  154. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  155. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  156. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  157. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  158. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  159. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  160. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  161. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  162. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  163. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  164. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  165. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  166. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  167. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  168. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  169. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  170. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  171. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  172. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  173. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  174. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  175. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  176. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  177. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/eval_config.py +0 -0
  178. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/registry.py +0 -0
  179. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/task_loader.py +0 -0
  180. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/task_names.py +0 -0
  181. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/task_style.py +0 -0
  182. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/tasks/utils.py +0 -0
  183. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/constants.py +0 -0
  184. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/file_ops.py +0 -0
  185. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/generate_task_docs.py +0 -0
  186. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/helpers.py +0 -0
  187. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/logging.py +0 -0
  188. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/packaging.py +0 -0
  189. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/utils/tqdm_handler.py +0 -0
  190. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/template_formatting/README.md +0 -0
  191. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/template_formatting/__init__.py +0 -0
  192. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/template_formatting/formatter.py +0 -0
  193. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/template_formatting/mistral_formatter.py +0 -0
  194. {eval_framework-0.3.7 → eval_framework-0.3.8}/src/template_formatting/py.typed +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.7
4
- Summary: Evalulation Framework
3
+ Version: 0.3.8
4
+ Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
7
7
  Version 2.0, January 2004
@@ -211,49 +211,49 @@ Classifier: Programming Language :: Python :: 3.12
211
211
  Classifier: Programming Language :: Python :: 3 :: Only
212
212
  Classifier: Topic :: Software Development :: Libraries
213
213
  Classifier: Typing :: Typed
214
- Requires-Dist: pyyaml>=6.0.1,<7
215
- Requires-Dist: xmltodict>=0.13.0,<0.16
216
- Requires-Dist: pydantic>=2.7,<3
217
- Requires-Dist: datasets>=4.0.0,<5
218
- Requires-Dist: sacrebleu>=2.4.3,<3
214
+ Requires-Dist: pyyaml>=6.0.3,<7
215
+ Requires-Dist: xmltodict>=0.15.1,<0.16
216
+ Requires-Dist: pydantic>=2.13.4,<3
217
+ Requires-Dist: datasets>=4.8.5,<5
218
+ Requires-Dist: sacrebleu>=2.6.0,<3
219
219
  Requires-Dist: pycountry>=24.6.1,<25
220
- Requires-Dist: nltk>=3.9.1,<4
221
- Requires-Dist: python-dotenv>=1.0.1,<2
222
- Requires-Dist: lingua-language-detector>=2.0.2,<3
223
- Requires-Dist: google-crc32c>=1.5.0,<2
220
+ Requires-Dist: nltk>=3.9.4,<4
221
+ Requires-Dist: python-dotenv>=1.2.2,<2
222
+ Requires-Dist: lingua-language-detector>=2.2.0,<3
223
+ Requires-Dist: google-crc32c>=1.8.0,<2
224
224
  Requires-Dist: kubernetes>=31.0.0,<32
225
225
  Requires-Dist: langdetect>=1.0.9,<2
226
- Requires-Dist: spacy>=3.8.3,<4
227
- Requires-Dist: jsonschema>=4.23.0,<5
228
- Requires-Dist: mysql-connector-python>=9.0.0,<10
229
- Requires-Dist: psycopg2-binary>=2.9.9,<3
226
+ Requires-Dist: spacy>=3.8.14,<4
227
+ Requires-Dist: jsonschema>=4.26.0,<5
228
+ Requires-Dist: mysql-connector-python>=9.7.0,<10
229
+ Requires-Dist: psycopg2-binary>=2.9.12,<3
230
230
  Requires-Dist: sympy>=1.13.1,<2
231
- Requires-Dist: llm-sandbox[docker]==0.3.37
231
+ Requires-Dist: llm-sandbox[docker]==0.3.39
232
232
  Requires-Dist: jsonlines>=4,<5
233
- Requires-Dist: lxml>=6,<7
234
- Requires-Dist: python-iso639>=2025.2.18
235
- Requires-Dist: wandb>=0.23.0,<1
236
- Requires-Dist: boto3>=1.40.54,<2
233
+ Requires-Dist: lxml>=6.1.1,<7
234
+ Requires-Dist: python-iso639>=2026.4.20
235
+ Requires-Dist: wandb>=0.27.0,<1
236
+ Requires-Dist: boto3>=1.43.18,<2
237
237
  Requires-Dist: numpy>=1.26.4
238
238
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
- Requires-Dist: scipy>=1.14.0,<2
239
+ Requires-Dist: scipy>=1.17.1,<2
240
240
  Requires-Dist: accelerate ; extra == 'accelerate'
241
241
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
242
242
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
243
- Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
244
- Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
245
- Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
246
- Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
247
- Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
243
+ Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
244
+ Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
245
+ Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
246
+ Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
247
+ Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
248
248
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
249
249
  Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
250
- Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
250
+ Requires-Dist: tiktoken>=0.13.0,<1 ; extra == 'openai'
251
251
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
252
252
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
253
253
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
254
254
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
255
255
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
256
- Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
256
+ Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
257
257
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
258
258
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
259
259
  Requires-Python: >=3.12, <3.13
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.7"
4
- description = "Evalulation Framework"
3
+ version = "0.3.8"
4
+ description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
7
7
  requires-python = ">=3.12,<3.13"
@@ -18,53 +18,53 @@ classifiers = [
18
18
  "Typing :: Typed",
19
19
  ]
20
20
  dependencies = [
21
- "pyyaml>=6.0.1,<7",
22
- "xmltodict>=0.13.0,<0.16",
23
- "pydantic>=2.7,<3",
24
- "datasets>=4.0.0,<5",
25
- "sacrebleu>=2.4.3,<3",
21
+ "pyyaml>=6.0.3,<7",
22
+ "xmltodict>=0.15.1,<0.16",
23
+ "pydantic>=2.13.4,<3",
24
+ "datasets>=4.8.5,<5",
25
+ "sacrebleu>=2.6.0,<3",
26
26
  "pycountry>=24.6.1,<25",
27
- "nltk>=3.9.1,<4",
28
- "python-dotenv>=1.0.1,<2",
29
- "lingua-language-detector>=2.0.2,<3",
30
- "google-crc32c>=1.5.0,<2",
27
+ "nltk>=3.9.4,<4",
28
+ "python-dotenv>=1.2.2,<2",
29
+ "lingua-language-detector>=2.2.0,<3",
30
+ "google-crc32c>=1.8.0,<2",
31
31
  "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
32
32
  "langdetect>=1.0.9,<2", # required by the original ifeval implementation
33
- "spacy>=3.8.3,<4",
34
- "jsonschema>=4.23.0,<5",
35
- "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
36
- "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
33
+ "spacy>=3.8.14,<4",
34
+ "jsonschema>=4.26.0,<5",
35
+ "mysql-connector-python>=9.7.0,<10", # required for sql-related tasks
36
+ "psycopg2-binary>=2.9.12,<3", # required for sql-related tasks
37
37
  "sympy>=1.13.1,<2",
38
- "llm-sandbox[docker]==0.3.37",
38
+ "llm-sandbox[docker]==0.3.39",
39
39
  "jsonlines>=4,<5",
40
- "lxml>=6,<7",
41
- "python-iso639>=2025.2.18",
42
- "wandb>=0.23.0,<1",
43
- "boto3>=1.40.54,<2",
40
+ "lxml>=6.1.1,<7",
41
+ "python-iso639>=2026.4.20",
42
+ "wandb>=0.27.0,<1",
43
+ "boto3>=1.43.18,<2",
44
44
  "numpy>=1.26.4",
45
45
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
46
46
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
47
47
  "antlr4-python3-runtime==4.11.0",
48
- "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
48
+ "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
49
49
 
50
50
  ]
51
51
 
52
52
  [project.optional-dependencies]
53
53
  # Model-specific extras
54
54
  determined = [
55
- "determined>=0.38,<0.39",
56
- "tensorboard==2.19.0"
55
+ "determined>=0.38.1,<0.39",
56
+ "tensorboard==2.20.0"
57
57
  ]
58
58
  api = ["aleph-alpha-client>=11.5.1"]
59
59
  openai = [
60
60
  "openai>=1.62,<3",
61
- "tiktoken>=0.9,<1",
61
+ "tiktoken>=0.13.0,<1",
62
62
  "transformers>=4.45.2,<5",
63
63
  ]
64
64
  transformers = [
65
65
  "transformers>=4.45.2,<5",
66
66
  "torch>=2.5,<3",
67
- "accelerate>=0.30.0,<1",
67
+ "accelerate>=0.34.2,<1",
68
68
  ]
69
69
  accelerate = ["accelerate"]
70
70
  vllm = [
@@ -72,13 +72,13 @@ vllm = [
72
72
  "torch>=2.5,<3"
73
73
  ]
74
74
  mistral = [
75
- "mistral-common>=1.7,<2",
76
- "huggingface-hub>=0.33.2,<0.34",
75
+ "mistral-common>=1.11.2,<2",
76
+ "huggingface-hub>=0.36.2,<0.37",
77
77
  "eval_framework[vllm]",
78
78
  ]
79
79
  # Benchmark/metric specific extras
80
80
  comet = [
81
- "unbabel-comet>=2.2.6,<3",
81
+ "unbabel-comet>=2.2.7,<3",
82
82
  ]
83
83
  # from template-formatting
84
84
  optional = [
@@ -97,24 +97,25 @@ eval_framework = "eval_framework.run:run"
97
97
 
98
98
  [dependency-groups]
99
99
  dev = [
100
- "mypy>=1.10,<2",
101
- "pytest>=8.3.3,<9",
102
- "pytest-mock>=3.14.1",
103
- "pytest-xdist>=3.6.1,<4",
100
+ "mypy>=2.1.0,<3",
101
+ "pytest>=9.0.3,<10",
102
+ "pytest-mock>=3.15.1",
103
+ "pytest-xdist>=3.8.0,<4",
104
104
  "pytest-sugar>1.1,<2",
105
- "types-pyyaml>=6.0.12.20240917,<7",
106
- "types-python-dateutil>=2.9.0.20241206,<3",
107
- "types-requests>=2.32.0.20250328,<3",
105
+ "types-pyyaml>=6.0.12.20260518,<7",
106
+ "types-python-dateutil>=2.9.0.20260518,<3",
107
+ "types-requests>=2.33.0.20260518,<3",
108
108
  "plotly>=5.24.1,<6",
109
- "ruff>=0.12.8",
109
+ "ruff>=0.15.15",
110
+ "pip-licenses>=5.5.5",
110
111
  ]
111
112
  flash-attn = [
112
- "flash-attn>=2.7.2.post1,<2.8",
113
+ "flash-attn>=2.8.3,<2.9",
113
114
  "torch"
114
115
  ]
115
116
 
116
117
  [build-system]
117
- requires = ["uv_build>=0.9.0,<0.10.0"]
118
+ requires = ["uv_build>=0.11.17,<0.11.18"]
118
119
  build-backend = "uv_build"
119
120
 
120
121
  [tool.uv.build-backend]
@@ -80,6 +80,7 @@ class EvaluationGenerator:
80
80
  )
81
81
  else:
82
82
  metric = metric_class()
83
+ metric.fail_on_error = self.config.fail_on_error
83
84
 
84
85
  logger.info(f"Starting calculation of {metric.NAME}")
85
86
  safe_tqdm_write(f"INFO: Calculating {metric.NAME}")
@@ -91,8 +91,7 @@ class OpenAIModel(BaseLLM):
91
91
  base_url=base_url,
92
92
  )
93
93
 
94
- # Initialize tokenizer for the model
95
- self._encoder = self._get_encoder()
94
+ self._encoder: tiktoken.Encoding | Tokenizer | None = self._get_encoder_or_none()
96
95
 
97
96
  # set bytes_per_token_scalar for non-standard models
98
97
  if bytes_per_token is not None and bytes_per_token <= 0:
@@ -101,9 +100,23 @@ class OpenAIModel(BaseLLM):
101
100
  4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
102
101
  )
103
102
 
104
- def _get_encoder(self) -> tiktoken.Encoding:
103
+ def _get_encoder_or_none(self) -> tiktoken.Encoding | None:
105
104
  assert self._model_name is not None
106
- return tiktoken.encoding_for_model(self._model_name)
105
+ try:
106
+ return tiktoken.encoding_for_model(self._model_name)
107
+ except KeyError:
108
+ logger.info(
109
+ "tiktoken could not map model_name=%r. Disabling token counting for this model.",
110
+ self._model_name,
111
+ )
112
+ return None
113
+ except Exception as e:
114
+ logger.warning(
115
+ "Failed to initialize tiktoken encoder for model_name=%r (%s). Disabling token counting.",
116
+ self._model_name,
117
+ e.__class__.__name__,
118
+ )
119
+ return None
107
120
 
108
121
  def _count_tokens(self, text: str) -> int:
109
122
  """
@@ -115,6 +128,8 @@ class OpenAIModel(BaseLLM):
115
128
  Returns:
116
129
  Number of tokens.
117
130
  """
131
+ if self._encoder is None:
132
+ raise RuntimeError("Token counting is not available (no encoder configured).")
118
133
  return len(self._encoder.encode(text))
119
134
 
120
135
  def generate_from_messages(
@@ -166,14 +181,31 @@ class OpenAIModel(BaseLLM):
166
181
  stop=stop_sequences,
167
182
  )
168
183
  completion = response.choices[0].text
184
+ usage = getattr(response, "usage", None)
185
+ prompt_tokens = getattr(usage, "prompt_tokens", None) if usage is not None else None
186
+ completion_tokens = getattr(usage, "completion_tokens", None) if usage is not None else None
169
187
  return RawCompletion(
170
188
  prompt=prompt,
171
- prompt_sequence_positions=self._count_tokens(prompt),
172
- concat_compression=ConcatCompression.calculate(
173
- single_messages, count_tokens=self._count_tokens, completion=completion
189
+ prompt_sequence_positions=(
190
+ prompt_tokens
191
+ if prompt_tokens is not None
192
+ else (self._count_tokens(prompt) if self._encoder is not None else None)
193
+ ),
194
+ concat_compression=(
195
+ ConcatCompression.calculate(
196
+ single_messages,
197
+ count_tokens=self._count_tokens,
198
+ completion=completion,
199
+ )
200
+ if self._encoder is not None
201
+ else None
174
202
  ),
175
203
  completion=completion,
176
- completion_sequence_positions=self._count_tokens(completion),
204
+ completion_sequence_positions=(
205
+ completion_tokens
206
+ if completion_tokens is not None
207
+ else (self._count_tokens(completion) if self._encoder is not None else None)
208
+ ),
177
209
  )
178
210
 
179
211
  else:
@@ -190,15 +222,26 @@ class OpenAIModel(BaseLLM):
190
222
  )
191
223
  prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
192
224
  prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
225
+ completion_tokens = getattr(chat_response.usage, "completion_tokens", None)
193
226
  completion = chat_response.choices[0].message.content or ""
194
227
  return RawCompletion(
195
228
  prompt=prompt,
196
229
  prompt_sequence_positions=prompt_tokens,
197
- concat_compression=ConcatCompression.calculate(
198
- single_messages, count_tokens=self._count_tokens, completion=completion
230
+ concat_compression=(
231
+ ConcatCompression.calculate(
232
+ single_messages,
233
+ count_tokens=self._count_tokens,
234
+ completion=completion,
235
+ )
236
+ if self._encoder is not None
237
+ else None
199
238
  ),
200
239
  completion=completion,
201
- completion_sequence_positions=self._count_tokens(completion),
240
+ completion_sequence_positions=(
241
+ completion_tokens
242
+ if completion_tokens is not None
243
+ else (self._count_tokens(completion) if self._encoder is not None else None)
244
+ ),
202
245
  )
203
246
 
204
247
  with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -218,6 +261,10 @@ class OpenAIModel(BaseLLM):
218
261
  Note:
219
262
  Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
220
263
  """
264
+ if self._encoder is None:
265
+ raise NotImplementedError(
266
+ "OpenAIModel.logprobs() requires a local tokenizer/encoder, but none is available."
267
+ )
221
268
  assert self._model_name in ["babbage-002", "davinci-002"], (
222
269
  "Log-probs for prompt tokens are only supported for a limited set of models."
223
270
  )
@@ -383,12 +430,16 @@ class DeepseekModel(OpenAIModel):
383
430
  base_url="https://api.deepseek.com/beta",
384
431
  )
385
432
  self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
433
+ # DeepSeek uses HF tokenization; override the base encoder (which may be None).
434
+ self._encoder = self._get_encoder()
386
435
 
387
436
  def _get_encoder(self) -> Tokenizer:
388
437
  return AutoTokenizer.from_pretrained(self._tokenizer_name)
389
438
 
390
439
  def _count_tokens(self, text: str) -> int:
391
- return len(self._encoder.encode(text))
440
+ encoder = self._encoder
441
+ assert encoder is not None
442
+ return len(encoder.encode(text)) # type: ignore[union-attr]
392
443
 
393
444
 
394
445
  ### Model Aliases ###
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import logging
5
+ import os
6
+ import signal
7
+ import socket
8
+ import subprocess
9
+ import time
10
+ import urllib.error
11
+ import urllib.request
12
+ from collections.abc import Sequence
13
+
14
+ from eval_framework.llm.base import BaseLLM
15
+ from eval_framework.llm.openai import OpenAIModel
16
+ from eval_framework.shared.types import RawCompletion, RawLoglikelihood
17
+ from eval_framework.tasks.base import Sample
18
+ from template_formatting.formatter import BaseFormatter, Message
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ def _pick_free_port(host: str) -> int:
24
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
25
+ s.bind((host, 0))
26
+ return int(s.getsockname()[1])
27
+
28
+
29
+ def _wait_for_http_ready(url: str, *, timeout_s: float) -> None:
30
+ deadline = time.time() + timeout_s
31
+ last_err: Exception | None = None
32
+ while time.time() < deadline:
33
+ try:
34
+ with urllib.request.urlopen(url, timeout=2) as resp:
35
+ if 200 <= resp.status < 500:
36
+ return
37
+ except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
38
+ last_err = e
39
+ time.sleep(0.25)
40
+ raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
41
+
42
+
43
+ def _wait_for_http_ready_or_proc_exit(url: str, *, timeout_s: float, proc: subprocess.Popen[str]) -> None:
44
+ """
45
+ Like `_wait_for_http_ready`, but fail fast if the server process exits.
46
+
47
+ This avoids long timeouts that hide the real root cause (e.g. invalid CLI flags,
48
+ missing dependencies, CUDA issues).
49
+ """
50
+ deadline = time.time() + timeout_s
51
+ last_err: Exception | None = None
52
+
53
+ while time.time() < deadline:
54
+ if proc.poll() is not None:
55
+ out = ""
56
+ try:
57
+ if proc.stdout is not None:
58
+ out = proc.stdout.read() or ""
59
+ except Exception:
60
+ out = ""
61
+ tail = out.strip()
62
+ if len(tail) > 8000:
63
+ tail = tail[-8000:]
64
+ raise RuntimeError(
65
+ f"vLLM server process exited before becoming ready. exit_code={proc.returncode}. Output (tail):\n{tail}"
66
+ )
67
+
68
+ try:
69
+ with urllib.request.urlopen(url, timeout=2) as resp:
70
+ if 200 <= resp.status < 500:
71
+ return
72
+ except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
73
+ last_err = e
74
+ time.sleep(0.25)
75
+
76
+ raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
77
+
78
+
79
class VLLMLocalServerModel(BaseLLM):
    """
    Provider-style model: start a local vLLM OpenAI-compatible server, then talk to it via `OpenAIModel(base_url=...)`.

    This gives you a stable HTTP boundary (good for VCR cassettes) while keeping "local vLLM" as a selectable backend.

    Notes:
    - The server is started in a subprocess using `vllm serve`.
    - Cleanup is best-effort (terminate, then kill if the process does not exit in time).
    - If startup fails or times out, the subprocess is stopped before the error propagates.
    - Once the server is ready, its combined stdout/stderr is drained in a background thread
      and logged at DEBUG level so the server cannot block on a full pipe.
    - Not all OpenAI API features are guaranteed to be supported by the local server (e.g. logprobs).
    """

    def __init__(
        self,
        *,
        model_name: str,
        host: str = "127.0.0.1",
        port: int | None = None,
        startup_timeout_s: float = 120.0,
        # `OpenAIModel` parameters:
        formatter: BaseFormatter | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        api_key: str | None = None,
        bytes_per_token: float | None = None,
        # vLLM "serve" parameters (subset, passed through):
        tensor_parallel_size: int | None = None,
        dtype: str | None = None,
        max_model_len: int | None = None,
        gpu_memory_utilization: float | None = None,
        enforce_eager: bool | None = None,
        # Escape hatch:
        vllm_command: str | None = None,
        vllm_extra_args: list[str] | None = None,
        env: dict[str, str] | None = None,
    ) -> None:
        """Start `vllm serve` for ``model_name`` and wait until its `/v1/models` endpoint responds.

        Args:
            model_name: Model passed to `vllm serve` and to the `OpenAIModel` client.
            host: Interface the server binds to.
            port: Port to bind; a free port is picked when None.
            startup_timeout_s: Seconds to wait for the server to become ready.
            formatter, temperature, top_p, api_key, bytes_per_token: Forwarded to `OpenAIModel`.
                When ``api_key`` is None, ``OPENAI_API_KEY`` or the placeholder "local-vllm" is used.
            tensor_parallel_size, dtype, max_model_len, gpu_memory_utilization, enforce_eager:
                Translated to the corresponding `vllm serve` flags when set.
            vllm_command: Executable to run instead of "vllm".
            vllm_extra_args: Extra CLI arguments appended verbatim.
            env: Environment variables layered on top of the current environment.

        Raises:
            RuntimeError: If the server exits or does not become ready in time.
        """
        # Assigned first so `_cleanup` / `__del__` are safe even if a later step raises.
        self._proc: subprocess.Popen[str] | None = None

        self._model_name = model_name
        self._host = host
        self._port = port if port is not None else _pick_free_port(host)
        self._startup_timeout_s = float(startup_timeout_s)

        self._server_url = f"http://{self._host}:{self._port}/v1"

        cmd = [vllm_command or "vllm", "serve", self._model_name, "--host", self._host, "--port", str(self._port)]

        # A small, intentionally conservative subset of flags.
        if tensor_parallel_size is not None:
            cmd += ["--tensor-parallel-size", str(tensor_parallel_size)]
        if dtype is not None:
            cmd += ["--dtype", str(dtype)]
        if max_model_len is not None:
            cmd += ["--max-model-len", str(max_model_len)]
        if gpu_memory_utilization is not None:
            cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)]
        # vLLM exposes this as a boolean flag; passing a value breaks CLI parsing.
        if enforce_eager:
            cmd += ["--enforce-eager"]

        if vllm_extra_args:
            cmd += list(vllm_extra_args)

        merged_env = os.environ.copy()
        if env:
            merged_env.update(env)

        logger.info("Starting local vLLM server: %s", " ".join(cmd))
        self._proc = subprocess.Popen(
            cmd,
            env=merged_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        # Ensure we don't leave it around if the process exits abruptly.
        atexit.register(self._cleanup)

        try:
            # Wait until the OpenAI-compatible endpoints respond.
            _wait_for_http_ready_or_proc_exit(
                f"{self._server_url}/models",
                timeout_s=self._startup_timeout_s,
                proc=self._proc,
            )

            # Configure client to talk to the local server.
            # For local servers, any non-empty API key typically works; allow explicit override.
            if api_key is not None:
                effective_api_key = api_key
            else:
                effective_api_key = os.getenv("OPENAI_API_KEY") or "local-vllm"

            self._client = OpenAIModel(
                model_name=self._model_name,
                formatter=formatter,
                temperature=temperature,
                top_p=top_p,
                api_key=effective_api_key,
                base_url=self._server_url,
                bytes_per_token=bytes_per_token,
            )
        except BaseException:
            # Startup failed (or was interrupted): stop the server now instead of leaving
            # it running until interpreter exit.
            self._cleanup()
            raise

        # From here on nobody reads the pipe, so drain it in the background; otherwise the
        # server would block once the OS pipe buffer fills up.
        self._start_output_drain(self._proc)

    @staticmethod
    def _start_output_drain(proc: subprocess.Popen[str]) -> None:
        """Forward the server's captured output to the logger (DEBUG) from a daemon thread."""
        import threading

        stream = proc.stdout
        if stream is None:
            return

        def _drain() -> None:
            try:
                for line in stream:
                    logger.debug("vllm: %s", line.rstrip())
            except (OSError, ValueError):
                # The pipe was closed or became unreadable; nothing left to forward.
                pass

        threading.Thread(target=_drain, name="vllm-server-output", daemon=True).start()

    @property
    def name(self) -> str:
        """Identifier of this backend, prefixed to distinguish it from remote providers."""
        return f"vllm_local::{self._model_name}"

    def generate_from_messages(
        self,
        messages: list[Sequence[Message]],
        stop_sequences: list[str] | None = None,
        max_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
    ) -> list[RawCompletion]:
        """Delegate generation to the `OpenAIModel` client pointed at the local server."""
        return self._client.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)

    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
        """Delegate log-probability computation to the `OpenAIModel` client."""
        return self._client.logprobs(samples)

    def _cleanup(self) -> None:
        """Stop the server process if it is still running (idempotent, best-effort)."""
        # getattr: `__del__` may run on an instance whose `__init__` failed very early.
        proc = getattr(self, "_proc", None)
        self._proc = None
        if proc is None:
            return

        # The atexit hook holds a reference to this instance; drop it once it is not needed.
        atexit.unregister(self._cleanup)

        if proc.poll() is not None:
            return

        try:
            proc.terminate()
            proc.wait(timeout=10)
        except Exception:
            try:
                # `kill()` is the portable spelling of SIGKILL; reap the child afterwards
                # so it does not linger as a zombie.
                proc.kill()
                proc.wait(timeout=5)
            except Exception:
                # Deliberately best-effort: this runs from atexit / __del__, where raising
                # would only produce noise.
                pass

    def __del__(self) -> None:
        self._cleanup()
@@ -1,3 +1,4 @@
1
+ import traceback
1
2
  from abc import ABC, abstractmethod
2
3
  from typing import Any
3
4
 
@@ -5,6 +6,7 @@ from pydantic import BaseModel, ConfigDict
5
6
 
6
7
  from eval_framework.metrics.aggregators.aggregators import Aggregator
7
8
  from eval_framework.shared.types import Error
9
+ from eval_framework.tasks.utils import raise_errors
8
10
 
9
11
 
10
12
  class MetricResult(BaseModel):
@@ -33,6 +35,8 @@ class BaseMetric[Response](ABC):
33
35
  # sample over multiple runs (LLM calls). We default to averaging and thus making
34
36
  # macro averaging the overall computation default.
35
37
  AGGREGATORS: list[Aggregator] = []
38
+ # Set by the evaluation generator before calculate(); controls how infra failures are handled.
39
+ fail_on_error: bool = False
36
40
 
37
41
  @classproperty
38
42
  def NAMES(cls) -> list[str]:
@@ -43,3 +47,17 @@ class BaseMetric[Response](ABC):
43
47
  @abstractmethod
44
48
  def calculate(self, response: Response) -> list[MetricResult]:
45
49
  raise NotImplementedError
50
+
51
+ def _record_or_raise(self, exc: Exception) -> list[MetricResult]:
52
+ """Infra failure (e.g. a Docker image-pull rate limit): abort when fail_on_error is set,
53
+ otherwise record a per-sample error so the run continues."""
54
+ if raise_errors() or self.fail_on_error:
55
+ raise exc
56
+ return [
57
+ MetricResult(
58
+ metric_name=self.NAME,
59
+ value=None,
60
+ higher_is_better=True,
61
+ error=Error(error_class=exc.__class__.__name__, message=str(exc), traceback=traceback.format_exc()),
62
+ )
63
+ ]
@@ -1,3 +1,5 @@
1
+ from llm_sandbox.exceptions import SandboxTimeoutError
2
+
1
3
  from eval_framework.metrics.base import BaseMetric, MetricResult
2
4
  from eval_framework.shared.types import Completion, Error
3
5
  from eval_framework.tasks.utils import run_python_code
@@ -14,7 +16,9 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
14
16
  code = response.completion
15
17
  try:
16
18
  output = run_python_code(code, image="python:3.12-slim")
17
- except Exception as e:
19
+ except SandboxTimeoutError as e:
20
+ # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
21
+ # problem.
18
22
  import traceback
19
23
 
20
24
  return [
@@ -25,6 +29,9 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
25
29
  error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
26
30
  )
27
31
  ]
32
+ except Exception as e:
33
+ # Any other sandbox/Docker error (e.g. an image pull rate limit) is an infra failure.
34
+ return self._record_or_raise(e)
28
35
 
29
36
  # Split and filter out empty strings
30
37
  output_parts = [part for part in output.split() if part.strip()]