eval-framework 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. {eval_framework-0.2.3 → eval_framework-0.2.4}/PKG-INFO +20 -15
  2. {eval_framework-0.2.3 → eval_framework-0.2.4}/README.md +15 -12
  3. {eval_framework-0.2.3 → eval_framework-0.2.4}/pyproject.toml +8 -5
  4. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/evaluation_generator.py +4 -4
  5. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/aleph_alpha.py +3 -7
  6. eval_framework-0.2.4/src/eval_framework/llm/openai.py +400 -0
  7. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/main.py +3 -2
  8. eval_framework-0.2.4/src/eval_framework/metrics/completion/aidanbench.py +28 -0
  9. eval_framework-0.2.4/src/eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  10. eval_framework-0.2.4/src/eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  11. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/response_generator.py +11 -83
  12. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/wandb_uploader.py +2 -6
  13. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/run.py +14 -3
  14. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/base.py +79 -1
  15. eval_framework-0.2.4/src/eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  16. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +14 -3
  17. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/task_names.py +2 -0
  18. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/generate_task_docs.py +1 -4
  19. eval_framework-0.2.4/src/eval_framework/utils/helpers.py +32 -0
  20. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/logging.py +17 -5
  21. eval_framework-0.2.4/src/eval_framework/utils/tqdm_handler.py +14 -0
  22. eval_framework-0.2.3/src/eval_framework/llm/openai.py +0 -226
  23. eval_framework-0.2.3/src/eval_framework/utils/helpers.py +0 -3
  24. {eval_framework-0.2.3 → eval_framework-0.2.4}/LICENSE +0 -0
  25. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/__init__.py +0 -0
  26. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/base_config.py +0 -0
  27. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/__init__.py +0 -0
  28. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/determined.py +0 -0
  29. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/eval.py +0 -0
  30. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/local.py +0 -0
  31. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/exceptions.py +0 -0
  32. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  33. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  34. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  35. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  36. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  37. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/__init__.py +0 -0
  38. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/base.py +0 -0
  39. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/huggingface.py +0 -0
  40. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/mistral.py +0 -0
  41. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/models.py +0 -0
  42. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/vllm.py +0 -0
  43. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/logger.py +0 -0
  44. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/__init__.py +0 -0
  45. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/base.py +0 -0
  46. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
  47. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  48. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
  49. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
  50. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  51. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  52. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/comet.py +0 -0
  53. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  54. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  55. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  56. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  57. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/f1.py +0 -0
  58. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  59. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  60. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  61. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
  62. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  63. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
  64. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  65. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  66. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  67. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
  68. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  69. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  70. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  71. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  72. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  73. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/ter.py +0 -0
  74. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  75. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  76. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  77. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
  78. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/base.py +0 -0
  79. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  80. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  81. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  82. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  83. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  84. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  85. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  86. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  87. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  88. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  89. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  90. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  91. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  92. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  93. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  94. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  95. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  96. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  97. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  98. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  99. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  100. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  101. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  102. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  103. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  104. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  105. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  106. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  107. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  108. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  109. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/py.typed +0 -0
  110. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/__init__.py +0 -0
  111. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/base.py +0 -0
  112. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  113. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/result_processor.py +0 -0
  114. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/__init__.py +0 -0
  117. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  118. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  119. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  120. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  121. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  122. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  123. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  124. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  125. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  126. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  127. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  128. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  129. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  130. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  131. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  132. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  133. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  134. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  135. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  136. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  137. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  138. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  139. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  140. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  141. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  142. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  143. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  144. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  145. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  146. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  147. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  148. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  149. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  150. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  151. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  152. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  153. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  154. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  155. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  156. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  157. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  158. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  159. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/eval_config.py +0 -0
  160. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/perturbation.py +0 -0
  161. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/registry.py +0 -0
  162. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/task_loader.py +0 -0
  163. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/utils.py +0 -0
  164. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/constants.py +0 -0
  165. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/file_ops.py +0 -0
  166. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/packaging.py +0 -0
  167. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/README.md +0 -0
  168. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/__init__.py +0 -0
  169. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/formatter.py +0 -0
  170. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/mistral_formatter.py +0 -0
  171. {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-framework
- Version: 0.2.3
+ Version: 0.2.4
  Summary: Evalulation Framework
  Author: Aleph Alpha Research
  License: Apache License
@@ -232,8 +232,9 @@ Requires-Dist: llm-sandbox[docker]>=0.1.8,<0.2
  Requires-Dist: jsonlines>=4,<5
  Requires-Dist: lxml>=6,<7
  Requires-Dist: python-iso639>=2025.2.18
- Requires-Dist: wandb>=0.21.1,<1
+ Requires-Dist: wandb>=0.23.0,<1
  Requires-Dist: boto3>=1.40.54,<2
+ Requires-Dist: numpy>=1.26.4
  Requires-Dist: accelerate ; extra == 'accelerate'
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
  Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -243,8 +244,9 @@ Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
  Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
  Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
- Requires-Dist: openai>=1.62,<2 ; extra == 'openai'
+ Requires-Dist: openai>=1.62,<2.8 ; extra == 'openai'
  Requires-Dist: tiktoken>=0.9,<0.10 ; extra == 'openai'
+ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
@@ -354,22 +356,25 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa

  ### Core Capabilities

- | **Reasoning** | **Knowledge** | **Coding** | **Long Context** |
- |---------------|---------------|------------|------------------|
- | MMLU (57 subjects) | TriviaQA | HumanEval | InfiniteBench |
- | SQuAD v1/v2 | MBPP |
- | ARC | Natural Questions | CodeT5 | ZeroSCROLLS |
- | HellaSwag | QuAC | Programming | QuALITY |
- | Winogrande | COPA | Debugging |
+ Subset of core capabilities benchmarks coverd by `eval-framework`:
+
+ | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
+ |---------------|---------------|----------|------------|------------------------|------------------|
+ | COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+ | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
+ | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
+

  ### Languages & Domains

- | **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency** |
+ Subset of language-specific and domain-specific benchmarks coverd by `eval-framework`:
+
+ | **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency Metrics** |
  |------------------|-----------------|-------------------|----------------|
- | WMT Translation | Legal (CaseHold) | TruthfulQA | Token counting |
- | FLORES-200 | Winogender | Latency metrics |
- | Multilingual MMLU | Medical (MedQA) | Stereotype detection | Memory usage |
- | German/Finnish tasks | Scientific (SciQ) | Harmful content | Cost analysis |
+ | WMT Translation | MMLU | TruthfulQA | Compression ratios |
+ | FLORES-200 | Legal (CaseHold) | Winogender | Runtime |
+ | Multilingual MMLU | Scientific (SciQ) | | |
+ | German/Finnish tasks | | | |

  ### Completion

@@ -86,22 +86,25 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa

  ### Core Capabilities

- | **Reasoning** | **Knowledge** | **Coding** | **Long Context** |
- |---------------|---------------|------------|------------------|
- | MMLU (57 subjects) | TriviaQA | HumanEval | InfiniteBench |
- | SQuAD v1/v2 | MBPP |
- | ARC | Natural Questions | CodeT5 | ZeroSCROLLS |
- | HellaSwag | QuAC | Programming | QuALITY |
- | Winogrande | COPA | Debugging |
+ Subset of core capabilities benchmarks coverd by `eval-framework`:
+
+ | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
+ |---------------|---------------|----------|------------|------------------------|------------------|
+ | COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+ | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
+ | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
+

  ### Languages & Domains

- | **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency** |
+ Subset of language-specific and domain-specific benchmarks coverd by `eval-framework`:
+
+ | **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency Metrics** |
  |------------------|-----------------|-------------------|----------------|
- | WMT Translation | Legal (CaseHold) | TruthfulQA | Token counting |
- | FLORES-200 | Winogender | Latency metrics |
- | Multilingual MMLU | Medical (MedQA) | Stereotype detection | Memory usage |
- | German/Finnish tasks | Scientific (SciQ) | Harmful content | Cost analysis |
+ | WMT Translation | MMLU | TruthfulQA | Compression ratios |
+ | FLORES-200 | Legal (CaseHold) | Winogender | Runtime |
+ | Multilingual MMLU | Scientific (SciQ) | | |
+ | German/Finnish tasks | | | |

  ### Completion

@@ -1,6 +1,6 @@
  [project]
  name = "eval-framework"
- version = "0.2.3"
+ version = "0.2.4"
  description = "Evalulation Framework"
  readme = "README.md"
  license = { file = "LICENSE" }
@@ -39,8 +39,9 @@ dependencies = [
      "jsonlines>=4,<5",
      "lxml>=6,<7",
      "python-iso639>=2025.2.18",
-     "wandb>=0.21.1,<1",
+     "wandb>=0.23.0,<1",
      "boto3>=1.40.54,<2",
+     "numpy>=1.26.4",
  ]

  [project.optional-dependencies]
@@ -51,8 +52,9 @@ determined = [
  ]
  api = ["aleph-alpha-client>=10,<11"]
  openai = [
-     "openai>=1.62,<2",
-     "tiktoken>=0.9,<0.10"
+     "openai>=1.62,<2.8",
+     "tiktoken>=0.9,<0.10",
+     "transformers>=4.45.2,<5",
  ]
  transformers = [
      "transformers>=4.45.2,<5",
@@ -149,6 +151,7 @@ select = [
      "UP", # Auto-upgrading of new Python features
      "I", # Sort imports
  ]
+
  [tool.ruff.lint.isort]
  # https://github.com/astral-sh/ruff-pre-commit/issues/121
  # https://github.com/astral-sh/ruff/issues/10519
@@ -163,7 +166,7 @@ known-third-party = ["wandb"]
  plugins = "pydantic.mypy"
  disallow_untyped_defs = true
  ignore_missing_imports = true
- files = ["src", "tests", "utils"]
+ files = ["src", "utils"]

  [tool.pytest.ini_options]
  testpaths = ["./tests"]
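Packaging takeaway from the pyproject.toml changes: `wandb` is bumped, `numpy` becomes a direct dependency, the `openai` extra now caps `openai` below 2.8 and additionally pulls in `transformers`, and `tests` is dropped from the mypy `files` list. A hypothetical install of this release with the updated extra (adjust to your package manager):

```bash
pip install "eval-framework[openai]==0.2.4"
```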
@@ -20,6 +20,7 @@ from eval_framework.tasks.base import ResponseType
  from eval_framework.tasks.eval_config import EvalConfig
  from eval_framework.tasks.registry import get_task
  from eval_framework.utils.constants import RED, RESET
+ from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write

  logger = logging.getLogger(__name__)

@@ -71,8 +72,8 @@ class EvaluationGenerator:
          metric = metric_class()

          logger.info(f"Starting calculation of {metric.NAME}")
-         tqdm.write(f"INFO: Calculating {metric.NAME}")
-         for response in tqdm(responses, desc=f"Calculating {metric.NAME}"):
+         safe_tqdm_write(f"INFO: Calculating {metric.NAME}")
+         for response in tqdm(responses, desc=f"Calculating {metric.NAME}", disable=get_disable_bar_flag()):
              if f"{response.subject}_{response.id}_{metric.__class__.__name__}" in subject_result_id_existing:
                  continue

@@ -109,7 +110,7 @@ class EvaluationGenerator:
              self.result_processor.save_metrics_result(result)

          logger.info(f"Completed calculation of {metric.NAME}")
-         tqdm.write(f"INFO: Completed {metric.NAME}")
+         safe_tqdm_write(f"INFO: Completed {metric.NAME}")

          if not self.save_intermediate_results:
              self.result_processor.save_metrics_results(results)
@@ -224,7 +225,6 @@ class EvaluationGenerator:
          aggregated_results = self._aggregate_results(metrics_results)

          wandb.log(aggregated_results)
-
          self.result_processor.save_aggregated_results(aggregated_results)
          logger.info(aggregated_results)
          logger.info(f"{RED}[ Evaluation completed and results saved! ]{RESET}")
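The two helpers imported above live in the new `utils/tqdm_handler.py` (+14 lines), whose body this view does not expand. A hypothetical sketch consistent with how they are called here; everything beyond the two signatures is an assumption:

```python
import os

from tqdm import tqdm


def get_disable_bar_flag() -> bool:
    # Assumption: progress bars are disabled via an environment switch,
    # e.g. for non-interactive CI logs. The real trigger may differ.
    return os.environ.get("DISABLE_TQDM", "").lower() in ("1", "true")


def safe_tqdm_write(msg: str) -> None:
    # Assumption: fall back to plain print() if tqdm cannot write,
    # so log lines are never lost when no progress bar is active.
    try:
        tqdm.write(msg)
    except Exception:
        print(msg)
```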
@@ -50,6 +50,7 @@ class AlephAlphaAPIModel(BaseLLM):
          self,
          formatter: BaseFormatter | None = None,
          checkpoint_name: str | None = None,
+         temperature: float | None = None,
          # Please see README.md for tips if adapting the following parameters.
          max_retries: int = 100,
          max_async_concurrent_requests: int = 32,
@@ -65,6 +66,7 @@ class AlephAlphaAPIModel(BaseLLM):
          else:
              self._formatter = formatter
          self._llm_name = checkpoint_name or self.LLM_NAME
+         self._temperature = temperature if temperature is not None else 0.0
          self.max_async_concurrent_requests = max_async_concurrent_requests
          self.max_retries = max_retries
          self.request_timeout_seconds = request_timeout_seconds
@@ -249,13 +251,7 @@ class AlephAlphaAPIModel(BaseLLM):
          max_tokens: int | None = None,
          temperature: float | None = None,
      ) -> list[RawCompletion]:
-         if temperature is None:
-             effective_temperature = 0.0  # Current default, TODO: refactor to use model's default
-             logger.info(
-                 f"Using default temperature value: {effective_temperature} as no custom temperature value was provided"
-             )
-         else:
-             effective_temperature = temperature
+         effective_temperature = temperature if temperature is not None else self._temperature

          requests = []

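Net effect of the aleph_alpha.py change: the default temperature moves from a hard-coded branch inside the request method to a constructor argument, with per-call values still taking precedence. A minimal sketch of the resolution order (illustrative; only the two-step fallback itself is taken from the diff):

```python
def resolve_temperature(call_value: float | None, ctor_value: float | None) -> float:
    # Constructor default falls back to 0.0; a per-call value overrides it.
    ctor_default = ctor_value if ctor_value is not None else 0.0
    return call_value if call_value is not None else ctor_default


assert resolve_temperature(None, None) == 0.0   # neither given -> 0.0
assert resolve_temperature(None, 0.7) == 0.7    # constructor default applies
assert resolve_temperature(0.2, 0.7) == 0.2     # per-call value wins
```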
@@ -0,0 +1,400 @@
+ import concurrent.futures
+ import logging
+ import math
+ import os
+ import traceback
+ from collections.abc import Callable, Sequence
+ from functools import partial
+
+ import tiktoken
+ from openai import OpenAI
+ from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
+ from tokenizers import Tokenizer
+ from transformers import AutoTokenizer
+
+ from eval_framework.llm.base import BaseLLM
+ from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
+ from eval_framework.tasks.base import Sample
+ from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIModel(BaseLLM):
+     """
+     LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
+     """
+
+     LLM_NAME: str | None = None
+     DEFAULT_FORMATTER: Callable[[], BaseFormatter] | None = None
+     BYTES_PER_TOKEN: float = 4.0  # rule of thumb according to https://platform.openai.com/tokenizer
+
+     def __init__(
+         self,
+         model_name: str | None = None,
+         formatter: BaseFormatter | None = None,
+         temperature: float | None = None,
+         api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
+         organization: str | None = None,
+         base_url: str | None = None,
+         bytes_per_token: float | None = None,
+     ) -> None:
+         """
+         Initialize the OpenAIModel.
+
+         Args:
+             model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
+             formatter: Optional message formatter.
+             temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
+             api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
+             organization: Optional OpenAI organization ID.
+             base_url: Optional API base URL for Azure or alternate endpoints.
+             bytes_per_token: Optional custom bytes per token scalar for non-standard models.
+         """
+         assert model_name is not None or self.LLM_NAME is not None, "A model name must be specified."
+         self._model_name = model_name if model_name else self.LLM_NAME
+         logger.info(f"Instantiating OpenAIModel with name: {self._model_name}")
+
+         self._formatter = formatter or (self.DEFAULT_FORMATTER() if self.DEFAULT_FORMATTER is not None else None)
+         self._temperature = temperature if temperature is not None else 0.0
+         assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
+
+         self._client = OpenAI(
+             api_key=api_key,
+             organization=organization,
+             base_url=base_url,
+         )
+
+         # Initialize tokenizer for the model
+         self._encoder = self._get_encoder()
+
+         # set bytes_per_token_scalar for non-standard models
+         if bytes_per_token is not None and bytes_per_token <= 0:
+             raise ValueError("bytes_per_token must be positive")
+         self.bytes_per_token_scalar = (
+             4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
+         )
+
+     def _get_encoder(self) -> tiktoken.Encoding:
+         assert self._model_name is not None
+         return tiktoken.encoding_for_model(self._model_name)
+
+     def _count_tokens(self, text: str) -> int:
+         """
+         Count tokens for the given text using the encoder.
+
+         Args:
+             text: Input string.
+
+         Returns:
+             Number of tokens.
+         """
+         return len(self._encoder.encode(text))
+
+     def generate_from_messages(
+         self,
+         messages: list[Sequence[Message]],
+         stop_sequences: list[str] | None = None,
+         max_tokens: int | None = None,
+         temperature: float | None = None,
+     ) -> list[RawCompletion]:
+         """
+         Generate completions for a list of message sequences concurrently.
+
+         Uses text completion API when a formatter is configured, otherwise uses chat completion API.
+
+         Args:
+             messages: Sequence of messages.
+             stop_sequences: Optional list of stop sequences.
+             max_tokens: Optional maximum number of tokens to generate.
+             temperature: Sampling temperature.
+
+         Returns:
+             List of RawCompletion objects containing prompts and completions.
+         """
+
+         effective_temperature = temperature if temperature is not None else self._temperature
+         assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
+
+         def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
+             # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
+             scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
+
+             if self._formatter is not None:
+                 # Use formatter and text completion API
+                 prompt = self._formatter.format(single_messages, output_mode="string")
+                 # documentation: https://platform.openai.com/docs/api-reference/completions/create
+                 assert self._model_name is not None
+                 response = self._client.completions.create(
+                     model=self._model_name,
+                     prompt=prompt,
+                     temperature=effective_temperature,
+                     max_tokens=scaled_max_tokens,
+                     stop=stop_sequences,
+                 )
+                 completion = response.choices[0].text
+                 return RawCompletion(
+                     prompt=prompt,
+                     prompt_sequence_positions=self._count_tokens(prompt),
+                     concat_compression=ConcatCompression.calculate(
+                         single_messages, count_tokens=self._count_tokens, completion=completion
+                     ),
+                     completion=completion,
+                     completion_sequence_positions=self._count_tokens(completion),
+                 )
+
+             else:
+                 # Use chat completion API
+                 chat_messages = [
+                     (
+                         ChatCompletionUserMessageParam(role="user", content=m.content)
+                         if m.role is not None and m.role.value.lower() == "user"
+                         else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
+                     )
+                     for m in single_messages
+                 ]
+                 assert self._model_name is not None
+                 chat_response = self._client.chat.completions.create(
+                     model=self._model_name,
+                     messages=chat_messages,
+                     temperature=effective_temperature,
+                     max_tokens=scaled_max_tokens,
+                     stop=stop_sequences,
+                 )
+                 prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
+                 prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
+                 completion = chat_response.choices[0].message.content or ""
+                 return RawCompletion(
+                     prompt=prompt,
+                     prompt_sequence_positions=prompt_tokens,
+                     concat_compression=ConcatCompression.calculate(
+                         single_messages, count_tokens=self._count_tokens, completion=completion
+                     ),
+                     completion=completion,
+                     completion_sequence_positions=self._count_tokens(completion),
+                 )
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             results = list(executor.map(_process_one, messages))
+         return results
+
+     def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+         """
+         Compute total log-probabilities for possible completions given each sample's prompt.
+
+         Args:
+             samples: List of Sample objects, each with prompt messages and possible completions.
+
+         Returns:
+             List of RawLoglikelihood objects mapping each prompt and completion to its log-probability.
+
+         Note:
+             Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
+         """
+         assert self._model_name in ["babbage-002", "davinci-002"], (
+             "Log-probs for prompt tokens are only supported for a limited set of models."
+         )
+         # apparently OpenAI stopped providing logprobs of prompt tokens, see discussion in:
+         # https://github.com/EleutherAI/lm-evaluation-harness/issues/1196
+
+         assert self._formatter is not None, "Log-probs require a formatter to create text prompts."
+         results: list[RawLoglikelihood] = []
+         for sample in samples:
+             prompt = self._formatter.format(sample.messages, output_mode="string") if sample.messages else ""
+             choices_log_probs: dict[str, float] = {}
+             choices_sequence_positions: dict[str, int] = {}
+             prompt_sequence_positions: int | None = self._count_tokens(prompt)
+             error: Error | None = None
+
+             for choice in sample.possible_completions or []:
+                 if error is not None:
+                     continue
+
+                 # Tokenize prompt and completion
+                 prompt_tokens = self._encoder.encode(prompt)
+                 completion_tokens = self._encoder.encode(choice)
+                 full_text = prompt + choice
+
+                 try:
+                     response = self._client.completions.create(
+                         model=self._model_name,
+                         prompt=full_text,
+                         echo=True,
+                         max_tokens=0,
+                         logprobs=1,
+                         temperature=0,
+                     )
+
+                     choice_obj = response.choices[0]
+                     if not hasattr(choice_obj, "logprobs") or choice_obj.logprobs is None:
+                         raise ValueError("Logprobs not returned in response.")
+
+                     all_tokens = getattr(choice_obj.logprobs, "tokens", None)
+                     all_logprobs = getattr(choice_obj.logprobs, "token_logprobs", None)
+
+                     if all_tokens is None or all_logprobs is None:
+                         raise ValueError("Logprobs response missing expected 'tokens' or 'token_logprobs' fields.")
+
+                     if len(all_tokens) != len(prompt_tokens) + len(completion_tokens):
+                         raise ValueError(
+                             f"Token count mismatch: tokens in response ({len(all_tokens)}) != prompt+completion "
+                             f"({len(prompt_tokens) + len(completion_tokens)})"
+                         )
+
+                     # Sum logprobs for the completion portion
+                     choices_log_probs[choice] = sum(all_logprobs[len(prompt_tokens) :])
+                     choices_sequence_positions[choice] = len(completion_tokens)
+
+                 except Exception as e:
+                     error = Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc())
+                     prompt_sequence_positions = None
+                     choices_log_probs = {}
+                     choices_sequence_positions = {}
+
+             results.append(
+                 RawLoglikelihood(
+                     prompt=prompt,
+                     prompt_sequence_positions=prompt_sequence_positions,
+                     loglikelihoods=choices_log_probs,
+                     loglikelihoods_sequence_positions=choices_sequence_positions,
+                     raw_loglikelihood_error=error,
+                 )
+             )
+         return results
+
+     def __del__(self) -> None:
+         if hasattr(self, "_client"):
+             self._client.close()
+
+
+ class OpenAIEmbeddingModel(BaseLLM):
+     def __init__(
+         self,
+         model_name: str = "text-embedding-3-large",
+         formatter: BaseFormatter | None = None,
+         api_key: str | None = None,
+         organization: str | None = None,
+         base_url: str | None = None,
+     ) -> None:
+         """Initialize OpenAI API client.
+         Args:
+             model_name: Name of the OpenAI model to use (e.g., "text-embedding-3-large")
+             formatter: Optional message formatter
+             api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable)
+             organization: Optional organization ID
+             base_url: Optional API base URL for Azure or other endpoints
+         """
+         if formatter is not None:
+             raise ValueError("Formatter is not supported for embedding model.")
+         self._model_name = model_name
+         logger.info(f"Using {model_name} as embedding model")
+         self._client = OpenAI(
+             api_key=api_key or os.getenv("OPENAI_API_KEY", ""),
+             organization=organization,
+             base_url=base_url,
+         )
+
+     def generate_from_messages(
+         self,
+         messages: list[Sequence[Message]],
+         stop_sequences: list[str] | None = None,
+         max_tokens: int | None = None,
+         temperature: float | None = None,
+     ) -> list[RawCompletion]:
+         raise NotImplementedError(
+             "Embedding model does not support generate_from_messages. Use generate_embeddings instead."
+         )
+
+     def generate_embeddings(
+         self,
+         messages: list[Sequence[Message]],
+     ) -> list[list[float]]:
+         embeddings = []
+         for single_messages in messages:
+             prompt = "".join([m.content for m in single_messages])
+             response = self._client.embeddings.create(model=self._model_name, input=[prompt])
+             embedding = response.data[0].embedding
+             embeddings.append(embedding)
+         return embeddings
+
+     def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+         raise NotImplementedError("Embedding model cannot return logprobs.")
+
+     def __del__(self) -> None:
+         if hasattr(self, "_client"):
+             try:
+                 self._client.close()
+             except Exception:
+                 pass
+
+
+ class DeepseekModel(OpenAIModel):
+     """
+     General Deepseek model wrapper using OpenAI-compatible API for deepseek-chat and deepseek-reasoner models.
+
+     Using the deepseek API: https://api-docs.deepseek.com/quick_start/pricing
+     """
+
+     def __init__(
+         self,
+         model_name: str | None = None,
+         formatter: BaseFormatter | None = None,
+         temperature: float | None = None,
+         api_key: str | None = None,
+         organization: str | None = None,
+         base_url: str | None = None,
+         tokenizer_name: str | None = None,
+     ) -> None:
+         super().__init__(
+             model_name=model_name,
+             formatter=formatter,
+             temperature=temperature,
+             api_key=os.getenv("DEEPSEEK_API_KEY", ""),
+             organization=organization,
+             base_url="https://api.deepseek.com/beta",
+         )
+         self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
+
+     def _get_encoder(self) -> Tokenizer:
+         return AutoTokenizer.from_pretrained(self._tokenizer_name)
+
+     def _count_tokens(self, text: str) -> int:
+         return len(self._encoder.encode(text))
+
+
+ ### Model Aliases ###
+
+
+ class OpenAI_gpt_4o_mini(OpenAIModel):
+     LLM_NAME = "gpt-4o-mini-2024-07-18"
+
+
+ class OpenAI_gpt_4o_mini_with_ConcatFormatter(OpenAIModel):
+     LLM_NAME = "gpt-4o-mini-2024-07-18"
+     DEFAULT_FORMATTER = ConcatFormatter
+
+
+ class OpenAI_davinci_002(OpenAIModel):
+     LLM_NAME = "davinci-002"
+     DEFAULT_FORMATTER = ConcatFormatter
+
+
+ class Deepseek_reasoner(DeepseekModel):
+     LLM_NAME = "deepseek-reasoner"  # DeepSeek-V3.2-Exp (Thinking Mode)
+     # multi-round conversations for reasoning model documented here:
+     # https://api-docs.deepseek.com/guides/reasoning_model#api-example
+     # does not support completion API
+
+
+ class Deepseek_chat(DeepseekModel):
+     LLM_NAME = "deepseek-chat"  # DeepSeek-V3.2-Exp (Non-thinking Mode)
+
+
+ class Deepseek_chat_with_formatter(DeepseekModel):
+     LLM_NAME = "deepseek-chat"  # DeepSeek-V3.2-Exp (Non-thinking Mode)
+     DEFAULT_FORMATTER = partial(HFFormatter, "deepseek-ai/DeepSeek-V3.2-Exp")
+     """
+     <|begin▁of▁sentence|><|User|>Question: What color is the night sky?
+     <|Assistant|></think>Answer:
+     """
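The rewritten module is considerably larger than the 0.2.3 version it replaces (+400 vs. -226 lines): it adds a constructor-level temperature, a bytes-per-token scaling knob, thread-pooled request fan-out, echo-based logprobs for babbage-002/davinci-002, and the DeepSeek wrappers. A hypothetical usage sketch; the `Message` construction is an assumption, since template_formatting.formatter is not expanded in this view:

```python
from eval_framework.llm.openai import OpenAI_gpt_4o_mini
from template_formatting.formatter import Message  # role/content message type used above

# With no formatter configured, OpenAIModel routes through the chat completions API.
llm = OpenAI_gpt_4o_mini(temperature=0.0)  # reads OPENAI_API_KEY from the environment

# Assumption: Message takes a role and content; the real role is likely an enum
# (the wrapper calls m.role.value), so this constructor call is illustrative only.
conversation = [Message(role="user", content="Name one prime number.")]
raw = llm.generate_from_messages([conversation], max_tokens=16)
print(raw[0].completion)
```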
@@ -29,12 +29,13 @@ def main(
      trial_id: int | None = None,
      *args: Any,
      resource_cleanup: bool = False,
+     verbosity: int = 1,
  ) -> list[Result]:
      """Runs the entire evaluation process: responses generation and evaluation."""
      # Set up centralized logging early
      output_dir = generate_output_dir(llm.name, config)
-     print(f"Output directory for evaluation: {output_dir}")
-     setup_logging(output_dir=output_dir, log_level=logging.INFO, log_filename="evaluation.log")
+     setup_logging(output_dir=output_dir, log_level=verbosity, log_filename="evaluation.log")
+     logger.info(f"Output directory for evaluation: {output_dir}")

      logger.info(f"{RED}[ Running full evaluation process ------- ]{RESET}")
      logger.info(f"Evaluating {llm.name} on {config.task_name}")
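main() now exposes a `verbosity` parameter that is handed to `setup_logging` as the log level (how the default of 1 maps to a concrete logging level lives in the updated utils/logging.py, which this view does not expand), and the bare print() of the output directory becomes a logger call. A self-contained sketch of that pattern, with placeholder values:

```python
import logging

logger = logging.getLogger(__name__)

# Stand-in for setup_logging(output_dir=..., log_level=verbosity, ...):
# once handlers and a level are configured centrally, modules log instead of print,
# so the message respects the chosen verbosity and also lands in evaluation.log.
logging.basicConfig(level=logging.INFO)
logger.info("Output directory for evaluation: %s", "/tmp/eval-run")  # placeholder path
```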
@@ -0,0 +1,28 @@
+ import logging
+
+ from eval_framework.metrics.base import BaseMetric, MetricResult
+ from eval_framework.shared.types import Completion
+
+ logger = logging.getLogger(__name__)
+
+
+ class AidanBenchMetric(BaseMetric[Completion]):
+     NAME = "AidanBench"
+
+     def calculate(self, response: Completion) -> list[MetricResult]:
+         # subtract 2 to not count 1) initial instruction and 2) the latest model response, which caused the stop
+         # i.e. was not (unique && coherent)
+         num_unique_responses = len(response.messages) - 2 if response.messages is not None else 0
+         if num_unique_responses < 0:
+             logger.warning(
+                 "Number of unique responses calculated as negative, setting to 0."
+                 "Likely something went wrong during answer generation."
+             )
+             num_unique_responses = 0
+         return [
+             MetricResult(
+                 metric_name=f"{self.NAME}/num_responses",
+                 value=num_unique_responses,
+                 higher_is_better=True,
+             )
+         ]
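The metric scores an AidanBench-style conversation purely by length: every message except the initial instruction and the final stopping response counts as a unique answer. A worked example of that rule (plain strings standing in for Completion.messages):

```python
# 1 instruction + 4 model answers; the 4th answer failed the
# unique-and-coherent check and ended the loop, so it does not count.
messages = ["instruction", "answer 1", "answer 2", "answer 3", "repeated answer"]
num_unique_responses = max(len(messages) - 2, 0)  # clamped at 0, as in the metric
assert num_unique_responses == 3
```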