eval-framework 0.2.6__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.6 → eval_framework-0.2.8}/PKG-INFO +47 -29
  2. {eval_framework-0.2.6 → eval_framework-0.2.8}/README.md +45 -28
  3. {eval_framework-0.2.6 → eval_framework-0.2.8}/pyproject.toml +4 -1
  4. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/determined.py +1 -0
  5. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/eval.py +2 -0
  6. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/local.py +1 -0
  7. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/evaluation_generator.py +4 -1
  8. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/aleph_alpha.py +10 -6
  9. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/math_reasoning_completion.py +10 -9
  10. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/base.py +2 -1
  11. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/comparison_grader.py +56 -4
  12. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +110 -25
  13. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +9 -0
  14. eval_framework-0.2.8/src/eval_framework/metrics/llm/utils.py +20 -0
  15. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/run.py +6 -0
  16. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/eval_config.py +1 -0
  17. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/generate_task_docs.py +24 -6
  18. {eval_framework-0.2.6 → eval_framework-0.2.8}/LICENSE +0 -0
  19. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/__init__.py +0 -0
  20. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/base_config.py +0 -0
  21. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/__init__.py +0 -0
  22. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/exceptions.py +0 -0
  23. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  24. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  25. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  26. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  27. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  28. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/__init__.py +0 -0
  29. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/base.py +0 -0
  30. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/huggingface.py +0 -0
  31. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/mistral.py +0 -0
  32. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/models.py +0 -0
  33. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/openai.py +0 -0
  34. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/vllm.py +0 -0
  35. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/logger.py +0 -0
  36. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/main.py +0 -0
  37. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/__init__.py +0 -0
  38. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/base.py +0 -0
  39. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/__init__.py +0 -0
  40. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  41. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  42. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/bleu.py +0 -0
  43. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/chrf.py +0 -0
  44. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  45. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  46. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/comet.py +0 -0
  47. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  48. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  49. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  50. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  51. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/f1.py +0 -0
  52. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  53. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  54. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  55. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/json_format.py +0 -0
  56. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  57. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/length_control.py +0 -0
  58. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  72. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  73. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  74. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  75. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  76. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  77. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  78. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  79. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  80. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  81. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  82. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  83. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  84. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  85. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  86. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  87. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  88. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  89. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  90. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  91. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  92. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  93. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  94. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  95. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  96. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  97. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  98. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  99. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  100. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/py.typed +0 -0
  101. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/response_generator.py +0 -0
  102. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/__init__.py +0 -0
  103. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/base.py +0 -0
  104. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  105. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/result_processor.py +0 -0
  106. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  107. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/run_direct.py +0 -0
  108. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/shared/types.py +0 -0
  109. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/__init__.py +0 -0
  110. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/base.py +0 -0
  111. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  112. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  113. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  114. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  115. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  116. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  117. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  118. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  119. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  120. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  121. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  122. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  123. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  124. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  125. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  126. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  127. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  128. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  129. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  130. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  131. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  132. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  133. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  134. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  135. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  136. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  137. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  138. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  139. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  140. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  141. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  142. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  143. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  144. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  145. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  146. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  147. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  148. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  149. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  150. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  151. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  152. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  153. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  154. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  155. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/perturbation.py +0 -0
  156. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/registry.py +0 -0
  157. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/task_loader.py +0 -0
  158. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/task_names.py +0 -0
  159. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/utils.py +0 -0
  160. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/constants.py +0 -0
  161. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/file_ops.py +0 -0
  162. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/helpers.py +0 -0
  163. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/logging.py +0 -0
  164. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/packaging.py +0 -0
  165. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/tqdm_handler.py +0 -0
  166. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.6
+Version: 0.2.8
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -235,6 +235,7 @@ Requires-Dist: python-iso639>=2025.2.18
 Requires-Dist: wandb>=0.23.0,<1
 Requires-Dist: boto3>=1.40.54,<2
 Requires-Dist: numpy>=1.26.4
+Requires-Dist: antlr4-python3-runtime==4.11.0
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -268,10 +269,24 @@ Provides-Extra: transformers
 Provides-Extra: vllm
 Description-Content-Type: text/markdown

+<!-- Badges -->
+<div align="center">
+
 # Aleph Alpha Eval-Framework

-> **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
-![eval-framework](docs/eval-framework.png "https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png")
+**Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
+
+[![Build Status](https://github.com/Aleph-Alpha-Research/eval-framework/actions/workflows/tests.yml/badge.svg)](https://github.com/Aleph-Alpha-Research/eval-framework/actions)
+[![Version](https://img.shields.io/github/v/release/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/releases)
+[![PyPI](https://img.shields.io/pypi/v/eval-framework.svg)](https://pypi.org/project/eval-framework/)
+[![License](https://img.shields.io/github/license/Aleph-Alpha-Research/eval-framework.svg)](LICENSE)
+
+[![Docs](https://img.shields.io/badge/docs-online-blue)](https://aleph-alpha-research.github.io/eval-framework/)
+[![Stars](https://img.shields.io/github/stars/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)
+
+![eval-framework](https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png)
+
+</div>

 ## Why Choose This Framework?

@@ -289,10 +304,12 @@ Description-Content-Type: text/markdown
 - Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
 - Docker Support: Pre-configured Dockerfiles for local and distributed setups.

+For full documentation, visit our [Docs Page](https://aleph-alpha-research.github.io/eval-framework/).
+
 ## Quick Start

 The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
-You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
+You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](https://aleph-alpha-research.github.io/eval-framework/installation.html).

 The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
 ```
@@ -350,7 +367,7 @@ eval_framework \
   --num-samples 10
 ```

-For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
+For more detailed CLI usage instructions, see the [CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html).

 ## Benchmark Coverage & Task Categories

@@ -403,7 +420,7 @@ Evaluation metrics include:
 - **LLM Metrics:** Chatbot Style Judge, Instruction Judge
 - **Efficiency Metrics:** Bytes per Sequence Position

-For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
+For the full list of tasks and metrics, see [Detailed Task Table](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html).

 ## Getting Started

@@ -419,9 +436,9 @@ Eval-Framework provides a unified interface for evaluating language models acros

 ### Core Components

-- **Models**: Defined via [`BaseLLM`](docs/evaluate_huggingface_model.md) interface (HuggingFace, OpenAI, custom APIs)
-- **Tasks**: Inherit from [`BaseTask`](docs/add_new_benchmark_guide.md) (completion, loglikelihood, or LLM-judge based)
-- **Metrics**: Automatic scoring via [`BaseMetric`](docs/benchmarks_and_metrics.md) classes
+- **Models**: Defined via [`BaseLLM`](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html) interface (HuggingFace, OpenAI, custom APIs)
+- **Tasks**: Inherit from [`BaseTask`](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html) (completion, loglikelihood, or LLM-judge based)
+- **Metrics**: Automatic scoring via [`BaseMetric`](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html) classes
 - **Formatters**: Handle prompt construction and model-specific formatting
 - **Results**: Structured outputs with sample-level details and aggregated statistics

@@ -466,41 +483,42 @@ if __name__ == "__main__":
     results = main(llm=llm, config=config)
 ```

-3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](docs/understanding_results_guide.md) to interpret them
+3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html) to interpret them

 ### Next Steps

-- **Use CLI interface**: See [CLI usage guide](docs/cli_usage.md) for command-line evaluation options
-- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](docs/evaluate_huggingface_model.md)
-- **Understand model arguments**: Read out [Model Arguments guide](docs/model_arguments.md)
-- **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
-- **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
-- **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
-- **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
+- **Use CLI interface**: See [CLI usage guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html) for command-line evaluation options
+- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)
+- **Understand model arguments**: Read out [Model Arguments guide](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)
+- **Create custom benchmarks**: Follow our [benchmark creation guide](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)
+- **Scale your evaluations**: Use [Determined AI integration](https://aleph-alpha-research.github.io/eval-framework/using_determined.html) for distributed evaluation
+- **Understand your results**: Read our [results interpretation guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)
+- **Log results in WandB**: See how [we integrate WandB](https://aleph-alpha-research.github.io/eval-framework/wandb_integration.html) for metric and lineage tracking

 ## Documentation

 ### Getting Started

-- **[CLI Usage Guide](docs/cli_usage.md)** - Detailed instructions for using the command-line interface
-- **[Evaluating HuggingFace Models](docs/evaluate_huggingface_model.md)** - Complete guide for evaluating HuggingFace models
-- **[Understanding Results](docs/understanding_results_guide.md)** - How to read and interpret evaluation results
+- **[CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html)** - Detailed instructions for using the command-line interface
+- **[Evaluating HuggingFace Models](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)** - Complete guide for evaluating HuggingFace models
+- **[Understanding Results](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)** - How to read and interpret evaluation results

 ### Advanced Usage

-- **[Understanding Model Arguments](docs/model_arguments.md)** - Thorough guide on each constructor argument for salient model classes
-- **[Adding New Benchmarks](docs/add_new_benchmark_guide.md)** - Complete guide with practical examples for adding new benchmarks
-- **[Benchmarks and Metrics](docs/benchmarks_and_metrics.md)** - Comprehensive overview of all available benchmarks and evaluation metrics
-- **[Overview of Dataloading](docs/overview_dataloading.md)** - Explanation of dataloading and task/sample/message structure
+- **[Understanding Model Arguments](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)** - Thorough guide on each constructor argument for salient model classes
+- **[Adding New Benchmarks](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)** - Complete guide with practical examples for adding new benchmarks
+- **[Benchmarks and Metrics](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html)** - Comprehensive overview of all available benchmarks and evaluation metrics
+- **[Overview of Dataloading](https://aleph-alpha-research.github.io/eval-framework/overview_dataloading.html)** - Explanation of dataloading and task/sample/message structure

 ### Scaling & Production

-- **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
-- **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
+- **[Using Determined](https://aleph-alpha-research.github.io/eval-framework/using_determined.html)** - Guide for distributed evaluation using Determined AI
+- **[Controlling Upload Results](https://aleph-alpha-research.github.io/eval-framework/controlling_upload_results.html)** - How to manage and control the upload of evaluation results

 ### Contributing

-- **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
+- **[Contributing Guide](https://aleph-alpha-research.github.io/eval-framework/CONTRIBUTING.html)** - Guide for contributing to this project
+- **[Testing](https://aleph-alpha-research.github.io/eval-framework/testing.html)** - Guide for running tests comparable to the CI pipelines

 ### Citation

@@ -526,6 +544,6 @@ This project has received funding from the European Union’s Digital Europe Pro
 The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.

 <p align="center">
-  <img src="docs/OELLM_1.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
-  <img src="docs/OELLM_2.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
+  <img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
+  <img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
 </p>
README.md
@@ -1,7 +1,21 @@
+<!-- Badges -->
+<div align="center">
+
 # Aleph Alpha Eval-Framework

-> **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
-![eval-framework](docs/eval-framework.png "https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png")
+**Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
+
+[![Build Status](https://github.com/Aleph-Alpha-Research/eval-framework/actions/workflows/tests.yml/badge.svg)](https://github.com/Aleph-Alpha-Research/eval-framework/actions)
+[![Version](https://img.shields.io/github/v/release/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/releases)
+[![PyPI](https://img.shields.io/pypi/v/eval-framework.svg)](https://pypi.org/project/eval-framework/)
+[![License](https://img.shields.io/github/license/Aleph-Alpha-Research/eval-framework.svg)](LICENSE)
+
+[![Docs](https://img.shields.io/badge/docs-online-blue)](https://aleph-alpha-research.github.io/eval-framework/)
+[![Stars](https://img.shields.io/github/stars/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)
+
+![eval-framework](https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png)
+
+</div>

 ## Why Choose This Framework?

@@ -19,10 +33,12 @@
 - Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
 - Docker Support: Pre-configured Dockerfiles for local and distributed setups.

+For full documentation, visit our [Docs Page](https://aleph-alpha-research.github.io/eval-framework/).
+
 ## Quick Start

 The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
-You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
+You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](https://aleph-alpha-research.github.io/eval-framework/installation.html).

 The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
 ```
@@ -80,7 +96,7 @@ eval_framework \
   --num-samples 10
 ```

-For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
+For more detailed CLI usage instructions, see the [CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html).

 ## Benchmark Coverage & Task Categories

@@ -133,7 +149,7 @@ Evaluation metrics include:
 - **LLM Metrics:** Chatbot Style Judge, Instruction Judge
 - **Efficiency Metrics:** Bytes per Sequence Position

-For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
+For the full list of tasks and metrics, see [Detailed Task Table](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html).

 ## Getting Started

@@ -149,9 +165,9 @@ Eval-Framework provides a unified interface for evaluating language models acros

 ### Core Components

-- **Models**: Defined via [`BaseLLM`](docs/evaluate_huggingface_model.md) interface (HuggingFace, OpenAI, custom APIs)
-- **Tasks**: Inherit from [`BaseTask`](docs/add_new_benchmark_guide.md) (completion, loglikelihood, or LLM-judge based)
-- **Metrics**: Automatic scoring via [`BaseMetric`](docs/benchmarks_and_metrics.md) classes
+- **Models**: Defined via [`BaseLLM`](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html) interface (HuggingFace, OpenAI, custom APIs)
+- **Tasks**: Inherit from [`BaseTask`](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html) (completion, loglikelihood, or LLM-judge based)
+- **Metrics**: Automatic scoring via [`BaseMetric`](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html) classes
 - **Formatters**: Handle prompt construction and model-specific formatting
 - **Results**: Structured outputs with sample-level details and aggregated statistics

@@ -196,41 +212,42 @@ if __name__ == "__main__":
     results = main(llm=llm, config=config)
 ```

-3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](docs/understanding_results_guide.md) to interpret them
+3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html) to interpret them

 ### Next Steps

-- **Use CLI interface**: See [CLI usage guide](docs/cli_usage.md) for command-line evaluation options
-- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](docs/evaluate_huggingface_model.md)
-- **Understand model arguments**: Read out [Model Arguments guide](docs/model_arguments.md)
-- **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
-- **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
-- **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
-- **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
+- **Use CLI interface**: See [CLI usage guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html) for command-line evaluation options
+- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)
+- **Understand model arguments**: Read out [Model Arguments guide](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)
+- **Create custom benchmarks**: Follow our [benchmark creation guide](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)
+- **Scale your evaluations**: Use [Determined AI integration](https://aleph-alpha-research.github.io/eval-framework/using_determined.html) for distributed evaluation
+- **Understand your results**: Read our [results interpretation guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)
+- **Log results in WandB**: See how [we integrate WandB](https://aleph-alpha-research.github.io/eval-framework/wandb_integration.html) for metric and lineage tracking

 ## Documentation

 ### Getting Started

-- **[CLI Usage Guide](docs/cli_usage.md)** - Detailed instructions for using the command-line interface
-- **[Evaluating HuggingFace Models](docs/evaluate_huggingface_model.md)** - Complete guide for evaluating HuggingFace models
-- **[Understanding Results](docs/understanding_results_guide.md)** - How to read and interpret evaluation results
+- **[CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html)** - Detailed instructions for using the command-line interface
+- **[Evaluating HuggingFace Models](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)** - Complete guide for evaluating HuggingFace models
+- **[Understanding Results](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)** - How to read and interpret evaluation results

 ### Advanced Usage

-- **[Understanding Model Arguments](docs/model_arguments.md)** - Thorough guide on each constructor argument for salient model classes
-- **[Adding New Benchmarks](docs/add_new_benchmark_guide.md)** - Complete guide with practical examples for adding new benchmarks
-- **[Benchmarks and Metrics](docs/benchmarks_and_metrics.md)** - Comprehensive overview of all available benchmarks and evaluation metrics
-- **[Overview of Dataloading](docs/overview_dataloading.md)** - Explanation of dataloading and task/sample/message structure
+- **[Understanding Model Arguments](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)** - Thorough guide on each constructor argument for salient model classes
+- **[Adding New Benchmarks](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)** - Complete guide with practical examples for adding new benchmarks
+- **[Benchmarks and Metrics](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html)** - Comprehensive overview of all available benchmarks and evaluation metrics
+- **[Overview of Dataloading](https://aleph-alpha-research.github.io/eval-framework/overview_dataloading.html)** - Explanation of dataloading and task/sample/message structure

 ### Scaling & Production

-- **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
-- **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
+- **[Using Determined](https://aleph-alpha-research.github.io/eval-framework/using_determined.html)** - Guide for distributed evaluation using Determined AI
+- **[Controlling Upload Results](https://aleph-alpha-research.github.io/eval-framework/controlling_upload_results.html)** - How to manage and control the upload of evaluation results

 ### Contributing

-- **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
+- **[Contributing Guide](https://aleph-alpha-research.github.io/eval-framework/CONTRIBUTING.html)** - Guide for contributing to this project
+- **[Testing](https://aleph-alpha-research.github.io/eval-framework/testing.html)** - Guide for running tests comparable to the CI pipelines

 ### Citation

@@ -256,6 +273,6 @@ This project has received funding from the European Union’s Digital Europe Pro
 The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.

 <p align="center">
-  <img src="docs/OELLM_1.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
-  <img src="docs/OELLM_2.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
+  <img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
+  <img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
 </p>
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.6"
+version = "0.2.8"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -42,6 +42,9 @@ dependencies = [
     "wandb>=0.23.0,<1",
     "boto3>=1.40.54,<2",
     "numpy>=1.26.4",
+    # is a dependency of sympy, but not explicitly listed in the requirements.txt
+    # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
+    "antlr4-python3-runtime==4.11.0",
 ]

 [project.optional-dependencies]
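The pin matters because sympy's LaTeX parser imports the ANTLR runtime lazily, at call time, and raises an ImportError when the installed runtime version does not match the one its grammar was generated against. A minimal sanity check, assuming `sympy` and the pinned `antlr4-python3-runtime` are installed:

```python
# Sanity check for the antlr4-python3-runtime pin: parse_latex only works when a
# compatible ANTLR runtime is importable, which is exactly what this pin ensures.
from sympy.parsing.latex import parse_latex

expr = parse_latex(r"\frac{1}{2} + x")
print(expr)  # prints the sympy expression x + 1/2
```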
src/eval_framework/context/determined.py
@@ -149,6 +149,7 @@ class DeterminedContext(EvalContext):
             wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
             batch_size=self.hparams.task_args.batch_size or self.batch_size,
             description=self.hparams.description or self.description,
+            randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
             or self.delete_output_dir_after_upload,
         )
src/eval_framework/context/eval.py
@@ -73,6 +73,7 @@ class EvalContext(AbstractContextManager):
         perturbation_type: str | None = None,
         perturbation_probability: float | None = None,
         perturbation_seed: int | None = None,
+        randomize_judge_order: bool = False,
         delete_output_dir_after_upload: bool | None = None,
     ) -> None:
         self.llm_name = llm_name
@@ -96,6 +97,7 @@ class EvalContext(AbstractContextManager):
         self.judge_model_args = judge_model_args if judge_model_args is not None else {}
         self.batch_size = batch_size
         self.description = description
+        self.randomize_judge_order = randomize_judge_order
         self.delete_output_dir_after_upload = delete_output_dir_after_upload

         if perturbation_type or perturbation_probability is not None:
src/eval_framework/context/local.py
@@ -63,6 +63,7 @@ class LocalContext(EvalContext):
             judge_model_args=self.judge_model_args,
             batch_size=self.batch_size,
             description=self.description,
+            randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.delete_output_dir_after_upload,
         )

src/eval_framework/evaluation_generator.py
@@ -67,7 +67,10 @@ class EvaluationGenerator:
             if llm_judge is None:
                 assert self.config.llm_judge_class is not None, "The llm_judge_class must be defined in the config."
                 llm_judge = self.config.llm_judge_class(**self.config.judge_model_args)
-            metric = metric_class(llm_judge=llm_judge)
+            metric = metric_class(
+                llm_judge=llm_judge,
+                randomize_order=self.config.randomize_judge_order,
+            )
         else:
             metric = metric_class()

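Taken together with the context changes above, the new flag flows from the CLI/context layer down into every judge-based metric constructor. A self-contained toy mirroring this wiring (class names here are stand-ins, not the framework's own):

```python
# Toy sketch of the plumbing added in this diff: a config-level flag
# (randomize_judge_order) is forwarded into judge metrics as randomize_order.
from dataclasses import dataclass


@dataclass
class ToyConfig:
    randomize_judge_order: bool = False


class ToyJudgeMetric:
    def __init__(self, llm_judge: object, randomize_order: bool = False) -> None:
        self._llm_judge = llm_judge
        self._randomize_order = randomize_order  # mirrors BaseLLMJudgeMetric below


config = ToyConfig(randomize_judge_order=True)
metric = ToyJudgeMetric(llm_judge=object(), randomize_order=config.randomize_judge_order)
assert metric._randomize_order is True
```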
src/eval_framework/llm/aleph_alpha.py
@@ -55,6 +55,8 @@ class AlephAlphaAPIModel(BaseLLM):
         request_timeout_seconds: int = 30 * 60 + 5,
         queue_full_timeout_seconds: int = 30 * 60 + 5,
         bytes_per_token: float | None = None,
+        token: str = os.getenv("AA_TOKEN", "dummy"),
+        base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
     ) -> None:
         self._formatter: BaseFormatter
         if formatter is None:
@@ -69,7 +71,9 @@ class AlephAlphaAPIModel(BaseLLM):
         self.max_retries = max_retries
         self.request_timeout_seconds = request_timeout_seconds
         self.queue_full_timeout_seconds = queue_full_timeout_seconds
-        self._validate_model_availability()
+        self.token = token
+        self.base_url = base_url
+        self._validate_model_availability(base_url, token)
         # set bytes_per_token_scalar for non-standard models
         if bytes_per_token is not None and bytes_per_token <= 0:
             raise ValueError("bytes_per_token must be positive")
@@ -77,15 +81,15 @@ class AlephAlphaAPIModel(BaseLLM):
             4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
         )

-    def _validate_model_availability(self) -> None:
+    def _validate_model_availability(self, base_url: str, token: str) -> None:
         """
         Validate that the model name is available by making a test request.
         """
         try:
             # 'Client' object does not support the context manager protocol
             client = Client(
-                host=os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
-                token=os.getenv("AA_TOKEN", "dummy"),
+                host=base_url,
+                token=token,
             )

             request = CompletionRequest(
@@ -190,10 +194,10 @@ class AlephAlphaAPIModel(BaseLLM):
         """Process multiple requests concurrently, returning request/response pairs."""
         semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
         async with AsyncClient(
-            host=os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
+            host=self.base_url,
             nice=True,
             request_timeout_seconds=self.request_timeout_seconds,
-            token=os.getenv("AA_TOKEN", "dummy"),
+            token=self.token,
             total_retries=0,  # we have a custom retry policy in _request_with_backoff()
         ) as client:
             tasks = (
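One subtlety with the new keyword defaults: Python evaluates default-argument expressions once, when the `def` statement runs, so `os.getenv("AA_TOKEN", "dummy")` in the signature captures the environment as it was at import time, not at construction time. Callers who export `AA_TOKEN` after the module is imported should pass the values explicitly. A standalone demonstration of the behavior (not the framework's class):

```python
import os

os.environ.pop("AA_TOKEN", None)  # simulate: token not set when the module is imported


def make_model(token: str = os.getenv("AA_TOKEN", "dummy")) -> str:
    # The default above was computed when this def ran, not when it is called.
    return token


os.environ["AA_TOKEN"] = "real-token"  # exported only after "import"
print(make_model())                          # "dummy" - stale default
print(make_model(os.environ["AA_TOKEN"]))    # "real-token" - explicit override works
```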
src/eval_framework/metrics/completion/math_reasoning_completion.py
@@ -204,10 +204,15 @@ class MathReasoningCompletion(BaseMetric[Completion]):
         timeout = 10
         # latex parse all ingested ground truth values for math reasoning
         for gt in response.ground_truth_list:
+            if gt is None:
+                continue
             signal.signal(signal.SIGALRM, timeout_handler)  # Set timeout signal
             signal.alarm(timeout)  # Set timeout duration
             try:
-                gt_parsed = parse_latex(gt)  # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
+                gt_normalized = self.normalize_expression(gt)
+                gt_parsed = parse_latex(
+                    gt_normalized
+                )  # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
                 ground_truths.append(gt_parsed)
             except Exception:
                 ground_truths.append(gt)
@@ -229,15 +234,11 @@
                 )
             ]
         else:
-            # fall back to string comparison
-            # ground truth can be list or str, we have str comparisons
-            assert isinstance(response.ground_truth, str)
-            str_is_correct = self._is_str_correct(normalized_response, response.ground_truth)
-            return [
-                MetricResult(
-                    metric_name=self.NAME, value=float(str_is_correct), higher_is_better=True, error=response.error
-                )
+            normalized_ground_truths = [
+                self.normalize_expression(gt) for gt in response.ground_truth_list if gt is not None
             ]
+            res = self._any_str_correct([normalized_response], normalized_ground_truths)
+            return [MetricResult(metric_name=self.NAME, value=float(res), higher_is_better=True, error=response.error)]

     def _any_str_correct(self, response_list: list, ground_truths: list) -> bool:
         """
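`normalize_expression` is now applied to both the ground truths and the response before parsing and comparison, but its implementation is not part of this diff. For illustration only, a plausible LaTeX normalizer of this kind might strip sizing and spacing macros and collapse whitespace:

```python
# Hypothetical sketch - the real normalize_expression is not shown in this diff.
import re


def normalize_expression(expr: str) -> str:
    """Strip cosmetic LaTeX before handing the string to parse_latex."""
    expr = expr.replace(r"\left", "").replace(r"\right", "")  # drop delimiter sizing commands
    for spacing in (r"\quad", r"\qquad", r"\,", r"\;", r"\!"):
        expr = expr.replace(spacing, " ")  # spacing macros carry no mathematical content
    return re.sub(r"\s+", " ", expr).strip()  # collapse runs of whitespace


assert normalize_expression(r"\left( x + 1 \right)\quad") == "( x + 1 )"
```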
src/eval_framework/metrics/llm/base.py
@@ -6,8 +6,9 @@ from eval_framework.shared.types import Completion, Error


 class BaseLLMJudgeMetric(BaseMetric[Completion]):
-    def __init__(self, llm_judge: BaseLLM) -> None:
+    def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
         self._llm_judge = llm_judge
+        self._randomize_order = randomize_order

     def _create_metric_result(
         self,
src/eval_framework/metrics/llm/graders/comparison_grader.py
@@ -1,3 +1,4 @@
+import random
 from collections.abc import Mapping
 from enum import Enum

@@ -8,6 +9,7 @@ from eval_framework.metrics.llm.graders.models import (
     PromptTemplateWithParseMap,
     parse_json_output,
 )
+from eval_framework.metrics.llm.utils import order_answers_for_comparison


 class MatchOutcome(str, Enum):
@@ -23,6 +25,14 @@ class MatchOutcome(str, Enum):
             return (0.5, 0.5)
         return (0, 1)

+    def flip(self) -> "MatchOutcome":
+        """Flip the outcome (A_WINS <-> B_WINS, DRAW stays DRAW)."""
+        if self == self.A_WINS:
+            return MatchOutcome.B_WINS
+        if self == self.B_WINS:
+            return MatchOutcome.A_WINS
+        return self  # DRAW stays DRAW
+
     @staticmethod
     def from_rank_literal(rank: int) -> "MatchOutcome":
         match rank:
@@ -122,25 +132,67 @@ Answer 2:
         self._prompt_templates = prompt_templates

     def grade(
-        self, instruction: str, completion_1: str, completion_2: str, language: Language
+        self,
+        instruction: str,
+        completion_1: str,
+        completion_2: str,
+        language: Language,
+        randomize_order: bool = False,
+        seed: int | None = None,
     ) -> ComparisonGradingOutput:
+        """Grade two completions by comparing them.
+
+        Args:
+            instruction: The instruction/task that was given.
+            completion_1: The first completion (typically the candidate).
+            completion_2: The second completion (typically the reference).
+            language: The language for the grading prompts.
+            randomize_order: If True, randomly swap the order of completions to eliminate
+                position bias.
+            seed: Optional random seed for reproducibility. If None and randomize_order
+                is True, uses a random swap decision.
+
+        Returns:
+            ComparisonGradingOutput with the outcome corrected for any position swap,
+            so outcome always reflects completion_1 vs completion_2 regardless of
+            presentation order to the judge.
+        """
         prompt_template = language.language_config(self._prompt_templates)

+        # Determine whether to swap the order
+        if randomize_order:
+            rng = random.Random(seed)
+            swap_order = rng.choice([True, False])
+        else:
+            swap_order = False
+
+        # Apply the swap if needed
+        actual_answer_1, actual_answer_2 = order_answers_for_comparison(completion_1, completion_2, swap_order)
+
         messages = prompt_template.to_messages(
             [],
             [
                 (self.INSTRUCTION_KEY, instruction),
-                (self.ANSWER_1_KEY, completion_1),
-                (self.ANSWER_2_KEY, completion_2),
+                (self.ANSWER_1_KEY, actual_answer_1),
+                (self.ANSWER_2_KEY, actual_answer_2),
             ],
         )

         raw_completion = self._grading_model.generate_from_messages([messages])[0]
         loaded_json = parse_json_output(raw_completion.completion)

+        # Get the raw outcome from the judge
+        raw_outcome: MatchOutcome | None = prompt_template.parse_map.get(
+            str(loaded_json.get(self.BETTER_ANSWER_KEY, None)), None
+        )
+
+        # Correct the outcome if we swapped the order
+        # If swapped: "Answer 1 is better" means completion_2 is better (B_WINS from completion_1's perspective)
+        final_outcome = raw_outcome.flip() if swap_order and raw_outcome is not None else raw_outcome
+
         return ComparisonGradingOutput(
             reasoning=loaded_json.get(self.REASONING_KEY, None),
-            outcome=prompt_template.parse_map.get(str(loaded_json.get(self.BETTER_ANSWER_KEY, None)), None),
+            outcome=final_outcome,
             judge_prompt=raw_completion.prompt,
             judge_response=raw_completion.completion,
         )
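The new `order_answers_for_comparison` helper lives in the added `src/eval_framework/metrics/llm/utils.py` (+20 lines), whose body is not shown in this diff. Judging by the call site above, a plausible sketch is a simple conditional swap, paired with `flip()` to map the judge's verdict back to the caller's frame of reference:

```python
# Plausible sketch only: utils.py is new in 0.2.8 and its contents are not shown
# in this diff. The signature is inferred from the grade() call site above.
def order_answers_for_comparison(answer_1: str, answer_2: str, swap_order: bool) -> tuple[str, str]:
    """Return the answers in the order they should be presented to the judge."""
    if swap_order:
        return answer_2, answer_1
    return answer_1, answer_2


# Round trip: if the judge saw swapped answers and picked "Answer 1", that vote
# belongs to completion_2, which is why grade() applies flip() when swap_order is True.
shown_1, shown_2 = order_answers_for_comparison("candidate", "reference", swap_order=True)
assert (shown_1, shown_2) == ("reference", "candidate")
```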