eval-framework 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. {eval_framework-0.2.0 → eval_framework-0.2.2}/LICENSE +1 -1
  2. {eval_framework-0.2.0 → eval_framework-0.2.2}/PKG-INFO +69 -76
  3. {eval_framework-0.2.0 → eval_framework-0.2.2}/README.md +67 -71
  4. {eval_framework-0.2.0 → eval_framework-0.2.2}/pyproject.toml +19 -22
  5. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/determined.py +11 -12
  6. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/eval.py +4 -3
  7. eval_framework-0.2.2/src/eval_framework/context/local.py +75 -0
  8. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/base.py +39 -0
  9. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/huggingface.py +58 -18
  10. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/models.py +8 -3
  11. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/vllm.py +70 -5
  12. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/main.py +8 -0
  13. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/response_generator.py +3 -0
  14. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/run.py +30 -18
  15. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/base.py +1 -1
  16. eval_framework-0.2.2/src/eval_framework/tasks/benchmarks/flores200.py +133 -0
  17. eval_framework-0.2.2/src/eval_framework/tasks/benchmarks/squad.py +211 -0
  18. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/struct_eval.py +17 -11
  19. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/utils.py +7 -1
  20. eval_framework-0.2.2/src/eval_framework/utils/file_ops.py +224 -0
  21. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/generate_task_docs.py +6 -6
  22. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/formatter.py +2 -1
  23. eval_framework-0.2.0/src/eval_framework/context/local.py +0 -52
  24. eval_framework-0.2.0/src/eval_framework/tasks/benchmarks/flores200.py +0 -62
  25. eval_framework-0.2.0/src/eval_framework/tasks/benchmarks/squad.py +0 -89
  26. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/__init__.py +0 -0
  27. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/base_config.py +0 -0
  28. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/__init__.py +0 -0
  29. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/evaluation_generator.py +0 -0
  30. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/exceptions.py +0 -0
  31. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  32. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  33. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  34. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  35. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  36. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/__init__.py +0 -0
  37. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/aleph_alpha.py +0 -0
  38. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/mistral.py +0 -0
  39. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/openai.py +0 -0
  40. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/logger.py +0 -0
  41. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/__init__.py +0 -0
  42. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/base.py +0 -0
  43. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/__init__.py +0 -0
  44. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  45. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/bleu.py +0 -0
  46. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/chrf.py +0 -0
  47. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  48. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  49. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/comet.py +0 -0
  50. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  51. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  52. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  53. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  54. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/f1.py +0 -0
  55. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  56. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  57. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  58. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/json_format.py +0 -0
  59. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  60. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/length_control.py +0 -0
  61. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  62. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  63. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  64. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/repetition.py +0 -0
  65. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  66. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  67. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  68. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  69. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  70. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/ter.py +0 -0
  71. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  72. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  73. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  74. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/__init__.py +0 -0
  75. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/base.py +0 -0
  76. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  77. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  78. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  79. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  80. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  81. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  82. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  83. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  84. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  85. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  86. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  87. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  88. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  89. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  90. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  91. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  92. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  93. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  94. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  95. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  96. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  97. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  98. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  99. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  100. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  101. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  102. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/py.typed +0 -0
  103. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/__init__.py +0 -0
  104. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/base.py +0 -0
  105. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/hf_processor.py +0 -0
  106. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/result_processor.py +0 -0
  107. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/run_direct.py +0 -0
  108. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/shared/types.py +0 -0
  109. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/__init__.py +0 -0
  110. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  111. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  112. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  113. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  114. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  115. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  116. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  117. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  118. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  119. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  120. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  121. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  122. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  123. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  124. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  125. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  126. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  127. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  128. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  129. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  130. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  131. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  132. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  133. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  134. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  135. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  136. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  137. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  138. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  139. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  140. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  141. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  142. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  143. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  144. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  145. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  146. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  147. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  148. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  149. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  150. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/eval_config.py +0 -0
  151. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/perturbation.py +0 -0
  152. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/registry.py +0 -0
  153. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/task_loader.py +0 -0
  154. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/task_names.py +0 -0
  155. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/constants.py +0 -0
  156. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/helpers.py +0 -0
  157. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/logging.py +0 -0
  158. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/packaging.py +0 -0
  159. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/README.md +0 -0
  160. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/__init__.py +0 -0
  161. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/mistral_formatter.py +0 -0
  162. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/py.typed +0 -0
  163. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_formatter_eval.py +0 -0
  164. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_formatter_scaling.py +0 -0
  165. {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_mistral_formatter.py +0 -0
@@ -186,7 +186,7 @@
186
186
  same "printed page" as the copyright notice for easier
187
187
  identification within third-party archives.
188
188
 
189
- Copyright [yyyy] [name of copyright owner]
189
+ Copyright 2025 Aleph Alpha Research GmbH
190
190
 
191
191
  Licensed under the Apache License, Version 2.0 (the "License");
192
192
  you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -191,7 +191,7 @@ License: Apache License
191
191
  same "printed page" as the copyright notice for easier
192
192
  identification within third-party archives.
193
193
 
194
- Copyright [yyyy] [name of copyright owner]
194
+ Copyright 2025 Aleph Alpha Research GmbH
195
195
 
196
196
  Licensed under the Apache License, Version 2.0 (the "License");
197
197
  you may not use this file except in compliance with the License.
@@ -218,8 +218,6 @@ Requires-Dist: datasets>=2.19.1,<4
218
218
  Requires-Dist: sacrebleu>=2.4.3,<3
219
219
  Requires-Dist: pycountry>=24.6.1,<25
220
220
  Requires-Dist: nltk>=3.9.1,<4
221
- Requires-Dist: types-pyyaml>=6.0.12.20240917,<7
222
- Requires-Dist: psutil>=6.1,<7
223
221
  Requires-Dist: python-dotenv>=1.0.1,<2
224
222
  Requires-Dist: lingua-language-detector>=2.0.2,<3
225
223
  Requires-Dist: google-crc32c>=1.5.0,<2
@@ -235,7 +233,6 @@ Requires-Dist: jsonlines>=4,<5
235
233
  Requires-Dist: lxml>=6,<7
236
234
  Requires-Dist: python-iso639>=2025.2.18
237
235
  Requires-Dist: wandb>=0.21.1,<1
238
- Requires-Dist: torch
239
236
  Requires-Dist: accelerate ; extra == 'accelerate'
240
237
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
241
238
  Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -270,21 +267,73 @@ Description-Content-Type: text/markdown
270
267
  # Aleph Alpha Eval-Framework
271
268
 
272
269
  > **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
270
+ ![eval-framework](docs/eval-framework.png "https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/docs/eval-framework.png")
273
271
 
274
- ## Features
272
+ ## Why Choose This Framework?
273
+
274
+ - **Scalability**: Built for distributed evaluation. Currently providing an integration with Determined AI.
275
+ - **Extensibility**: Easily add custom models, benchmarks, and metrics with object-oriented base classes.
276
+ - **Comprehensive**: Comes pre-loaded with over 90 tasks covering a broad and diverse range, from reasoning and coding to safety and long-context. Also comes with a comprehensive set of metrics, including LLM-as-a-judge evaluations.
277
+
278
+ ## Other features
275
279
 
276
- - 90+ Benchmarks: Covers reasoning, knowledge, coding, long-context, and safety tasks.
277
- - Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
278
- - Distributed Evaluation: Integration with Determined AI for scalable distributed evaluation.
279
- - Docker Support: Pre-configured Dockerfiles for local and distributed setups.
280
280
  - Flexible Model Integration: Supports models loaded via HuggingFace Transformers or custom implementations using the BaseLLM class.
281
+ - Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
281
282
  - Custom Metrics: Easily define new metrics using the BaseMetric class.
282
- - Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
283
283
  - Perturbation Testing: Robustness analysis with configurable perturbation types and probabilities.
284
+ - Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
284
285
  - Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
285
- - LLM-as-a-Judge: Evaluation using LLM judges.
286
+ - Docker Support: Pre-configured Dockerfiles for local and distributed setups.
287
+
288
+ ## Quick Start
289
+
290
+ The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
291
+ You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
292
+
293
+ The easiest way to get started is by installing the library via `pip` and using it as an external dependency.
294
+ ```
295
+ pip install eval_framework
296
+ ```
297
+
298
+ There are optional extras available to unlock specific features of the library:
299
+ - `api` for inference using the aleph-alpha client.
300
+ - `comet` for the COMET metric.
301
+ - `determined` for running jobs via determined.
302
+ - `mistral` for inference on Mistral models.
303
+ - `transformers` for inference using the transformers library.
304
+ - `vllm` for inference via VLLM.
305
+
306
+ As a short hand, the `all` extra installs all of the above.
307
+
308
+ For development, you can instead install it directly from the repository. Please first install
309
+ [uv](https://docs.astral.sh/uv/getting-started/installation/)
286
310
 
287
- ![eval-framework](docs/eval-framework.png "eval-framework")
311
+ To install the project with all optional extras use
312
+ ```bash
313
+ uv sync --all-extras
314
+ ```
315
+
316
+ We provide custom groups to control optional extras.
317
+ - `flash_attn`: Install `flash_attn` with correct handling of build isolation
318
+
319
+ Thus, the following will setup the project with `flash_attn`
320
+ ```bash
321
+ uv sync --all-extras --group flash_attn
322
+ ```
323
+
324
+ To evaluate a single benchmark locally, you can use the following command:
325
+
326
+ ```bash
327
+ eval_framework \
328
+ --models src/eval_framework/llm/models.py \
329
+ --llm-name Smollm135MInstruct \
330
+ --task-name "GSM8K" \
331
+ --output-dir ./eval \
332
+ --num-fewshot 5 \
333
+ --num-samples 10
334
+ ```
335
+
336
+ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
288
337
 
289
338
  ## Benchmark Coverage & Task Categories
290
339
 
@@ -336,51 +385,6 @@ Evaluation metrics include:
336
385
 
337
386
  For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
338
387
 
339
- ## Quick Start
340
-
341
- The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
342
- You will also need the appropriate CUDA dependencies and version installed on your system for GPU support.
343
-
344
- The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
345
- ```
346
- pip install eval_framework
347
- ```
348
-
349
- There are optional extras available to unlock specific features of the library:
350
- - `mistral` for inference on Mistral models
351
- - `transformers` for inference using the transformers library
352
- - `api` for inference using the aleph-alpha client.
353
- - `vllm` for inference via VLLM
354
- - `determined` for running jobs via determined
355
- - `comet` for the COMET metric
356
-
357
- As a short hand, the `all` extra installs all of the above.
358
-
359
- For development, you can instead install it directly from the repository instead, please first install
360
- [uv](https://docs.astral.sh/uv/getting-started/installation/)
361
-
362
- To install the project with all optional extras use
363
- ```bash
364
- uv sync --all-extras
365
- ```
366
-
367
- We provide custom groups to control optional extras.
368
- - `cpu`: Use the CPU backend for torch
369
- - `cu124`: Use the CUDA 12.4 backend
370
- - `flash_attn`: Install `flash_attn` with correct handling of build isolation
371
-
372
- Thus, the following will setup the project with `flash_attn` and CUDA 12.4
373
- ```bash
374
- uv sync --all-extras --group flash_attn --group cu124
375
- ```
376
-
377
- There is also a pre-commit hook to help with development:
378
- ```
379
- uv run pre-commit install
380
- ```
381
-
382
- After installation, task documentation can be generated with `uv run python src/eval_framework/utils/generate_task_docs.py` (see [docs/installation.md](docs/installation.md)) for more details.
383
-
384
388
  ## Getting Started
385
389
 
386
390
  ### Understanding the Evaluation Framework
@@ -449,22 +453,7 @@ pip install eval_framework[transformers]
449
453
  - **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
450
454
  - **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
451
455
  - **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
452
-
453
- ### Example CLI Usage
454
-
455
- To evaluate a single benchmark locally, you can use the following command:
456
-
457
- ```bash
458
- eval_framework \
459
- --models src/eval_framework/llm/models.py \
460
- --llm-name Smollm135MInstruct \
461
- --task-name "GSM8K" \
462
- --output-dir ./eval \
463
- --num-fewshot 5 \
464
- --num-samples 10
465
- ```
466
-
467
- For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
456
+ - **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
468
457
 
469
458
  ## Documentation
470
459
 
@@ -485,6 +474,10 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa
485
474
  - **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
486
475
  - **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
487
476
 
477
+ ### Contributing
478
+
479
+ - **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
480
+
488
481
  ### Citation
489
482
 
490
483
  If you use `eval-framework` in your research, please cite:
@@ -509,6 +502,6 @@ This project has received funding from the European Union’s Digital Europe Pro
509
502
  The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
510
503
 
511
504
  <p align="center">
512
- <img src="docs/OELLM_1.png" alt="OELLM 1" width="100" style="margin-right: 50px;"/>
513
- <img src="docs/OELLM_2.png" alt="OELLM 2" width="350"/>
505
+ <img src="docs/OELLM_1.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
506
+ <img src="docs/OELLM_2.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_2.png" width="350"/>
514
507
  </p>
@@ -1,21 +1,73 @@
1
1
  # Aleph Alpha Eval-Framework
2
2
 
3
3
  > **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
4
+ ![eval-framework](docs/eval-framework.png "https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/docs/eval-framework.png")
4
5
 
5
- ## Features
6
+ ## Why Choose This Framework?
7
+
8
+ - **Scalability**: Built for distributed evaluation. Currently providing an integration with Determined AI.
9
+ - **Extensibility**: Easily add custom models, benchmarks, and metrics with object-oriented base classes.
10
+ - **Comprehensive**: Comes pre-loaded with over 90 tasks covering a broad and diverse range, from reasoning and coding to safety and long-context. Also comes with a comprehensive set of metrics, including LLM-as-a-judge evaluations.
11
+
12
+ ## Other features
6
13
 
7
- - 90+ Benchmarks: Covers reasoning, knowledge, coding, long-context, and safety tasks.
8
- - Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
9
- - Distributed Evaluation: Integration with Determined AI for scalable distributed evaluation.
10
- - Docker Support: Pre-configured Dockerfiles for local and distributed setups.
11
14
  - Flexible Model Integration: Supports models loaded via HuggingFace Transformers or custom implementations using the BaseLLM class.
15
+ - Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
12
16
  - Custom Metrics: Easily define new metrics using the BaseMetric class.
13
- - Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
14
17
  - Perturbation Testing: Robustness analysis with configurable perturbation types and probabilities.
18
+ - Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
15
19
  - Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
16
- - LLM-as-a-Judge: Evaluation using LLM judges.
20
+ - Docker Support: Pre-configured Dockerfiles for local and distributed setups.
21
+
22
+ ## Quick Start
23
+
24
+ The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
25
+ You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
26
+
27
+ The easiest way to get started is by installing the library via `pip` and using it as an external dependency.
28
+ ```
29
+ pip install eval_framework
30
+ ```
31
+
32
+ There are optional extras available to unlock specific features of the library:
33
+ - `api` for inference using the aleph-alpha client.
34
+ - `comet` for the COMET metric.
35
+ - `determined` for running jobs via determined.
36
+ - `mistral` for inference on Mistral models.
37
+ - `transformers` for inference using the transformers library.
38
+ - `vllm` for inference via VLLM.
39
+
40
+ As a short hand, the `all` extra installs all of the above.
41
+
42
+ For development, you can instead install it directly from the repository. Please first install
43
+ [uv](https://docs.astral.sh/uv/getting-started/installation/)
17
44
 
18
- ![eval-framework](docs/eval-framework.png "eval-framework")
45
+ To install the project with all optional extras use
46
+ ```bash
47
+ uv sync --all-extras
48
+ ```
49
+
50
+ We provide custom groups to control optional extras.
51
+ - `flash_attn`: Install `flash_attn` with correct handling of build isolation
52
+
53
+ Thus, the following will setup the project with `flash_attn`
54
+ ```bash
55
+ uv sync --all-extras --group flash_attn
56
+ ```
57
+
58
+ To evaluate a single benchmark locally, you can use the following command:
59
+
60
+ ```bash
61
+ eval_framework \
62
+ --models src/eval_framework/llm/models.py \
63
+ --llm-name Smollm135MInstruct \
64
+ --task-name "GSM8K" \
65
+ --output-dir ./eval \
66
+ --num-fewshot 5 \
67
+ --num-samples 10
68
+ ```
69
+
70
+ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
19
71
 
20
72
  ## Benchmark Coverage & Task Categories
21
73
 
@@ -67,51 +119,6 @@ Evaluation metrics include:
67
119
 
68
120
  For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
69
121
 
70
- ## Quick Start
71
-
72
- The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
73
- You will also need the appropriate CUDA dependencies and version installed on your system for GPU support.
74
-
75
- The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
76
- ```
77
- pip install eval_framework
78
- ```
79
-
80
- There are optional extras available to unlock specific features of the library:
81
- - `mistral` for inference on Mistral models
82
- - `transformers` for inference using the transformers library
83
- - `api` for inference using the aleph-alpha client.
84
- - `vllm` for inference via VLLM
85
- - `determined` for running jobs via determined
86
- - `comet` for the COMET metric
87
-
88
- As a short hand, the `all` extra installs all of the above.
89
-
90
- For development, you can instead install it directly from the repository instead, please first install
91
- [uv](https://docs.astral.sh/uv/getting-started/installation/)
92
-
93
- To install the project with all optional extras use
94
- ```bash
95
- uv sync --all-extras
96
- ```
97
-
98
- We provide custom groups to control optional extras.
99
- - `cpu`: Use the CPU backend for torch
100
- - `cu124`: Use the CUDA 12.4 backend
101
- - `flash_attn`: Install `flash_attn` with correct handling of build isolation
102
-
103
- Thus, the following will setup the project with `flash_attn` and CUDA 12.4
104
- ```bash
105
- uv sync --all-extras --group flash_attn --group cu124
106
- ```
107
-
108
- There is also a pre-commit hook to help with development:
109
- ```
110
- uv run pre-commit install
111
- ```
112
-
113
- After installation, task documentation can be generated with `uv run python src/eval_framework/utils/generate_task_docs.py` (see [docs/installation.md(docs/installation.md)) for more details.
114
-
115
122
  ## Getting Started
116
123
 
117
124
  ### Understanding the Evaluation Framework
@@ -180,22 +187,7 @@ pip install eval_framework[transformers]
180
187
  - **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
181
188
  - **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
182
189
  - **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
183
-
184
- ### Example CLI Usage
185
-
186
- To evaluate a single benchmark locally, you can use the following command:
187
-
188
- ```bash
189
- eval_framework \
190
- --models src/eval_framework/llm/models.py \
191
- --llm-name Smollm135MInstruct \
192
- --task-name "GSM8K" \
193
- --output-dir ./eval \
194
- --num-fewshot 5 \
195
- --num-samples 10
196
- ```
197
-
198
- For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
190
+ - **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
199
191
 
200
192
  ## Documentation
201
193
 
@@ -216,6 +208,10 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa
216
208
  - **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
217
209
  - **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
218
210
 
211
+ ### Contributing
212
+
213
+ - **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
214
+
219
215
  ### Citation
220
216
 
221
217
  If you use `eval-framework` in your research, please cite:
@@ -240,6 +236,6 @@ This project has received funding from the European Union’s Digital Europe Pro
240
236
  The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
241
237
 
242
238
  <p align="center">
243
- <img src="docs/OELLM_1.png" alt="OELLM 1" width="100" style="margin-right: 50px;"/>
244
- <img src="docs/OELLM_2.png" alt="OELLM 2" width="350"/>
239
+ <img src="docs/OELLM_1.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
240
+ <img src="docs/OELLM_2.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_2.png" width="350"/>
245
241
  </p>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -25,25 +25,21 @@ dependencies = [
25
25
  "sacrebleu>=2.4.3,<3",
26
26
  "pycountry>=24.6.1,<25",
27
27
  "nltk>=3.9.1,<4",
28
- "types-pyyaml>=6.0.12.20240917,<7",
29
- "psutil>=6.1,<7",
30
28
  "python-dotenv>=1.0.1,<2",
31
29
  "lingua-language-detector>=2.0.2,<3",
32
30
  "google-crc32c>=1.5.0,<2",
33
- "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
34
- "langdetect>=1.0.9,<2", # required by the original ifeval implementation
31
+ "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
32
+ "langdetect>=1.0.9,<2", # required by the original ifeval implementation
35
33
  "spacy>=3.8.3,<4",
36
34
  "jsonschema>=4.23.0,<5",
37
- "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
38
- "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
35
+ "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
36
+ "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
39
37
  "sympy>=1.13.1,<2",
40
38
  "llm-sandbox[docker]>=0.1.8,<0.2",
41
39
  "jsonlines>=4,<5",
42
40
  "lxml>=6,<7",
43
41
  "python-iso639>=2025.2.18",
44
42
  "wandb>=0.21.1,<1",
45
- # Needed for uv bug: https://github.com/astral-sh/uv/issues/15661
46
- "torch",
47
43
  ]
48
44
 
49
45
  [project.optional-dependencies]
@@ -99,9 +95,10 @@ dev = [
99
95
  "plotly>=5.24.1,<6",
100
96
  "ruff>=0.12.8",
101
97
  ]
102
- flash-attn = ["flash-attn>=2.7.2.post1,<2.8"]
103
- cu124 = ["torch"]
104
- cpu = ["torch"]
98
+ flash-attn = [
99
+ "flash-attn>=2.7.2.post1,<2.8",
100
+ "torch"
101
+ ]
105
102
 
106
103
  [build-system]
107
104
  requires = ["uv_build>=0.8.10,<0.9.0"]
@@ -114,17 +111,11 @@ module-name = ["eval_framework", "template_formatting"]
114
111
  override-dependencies = [
115
112
  "requests>=2.32,<3", # fix for determined
116
113
  ]
117
- conflicts = [
118
- [
119
- { group = "cpu" },
120
- { group = "cu124" },
121
- ],
122
- ]
123
114
 
124
115
  [tool.uv.sources]
125
116
  torch = [
126
- { index = "pytorch-cu124", group = "cu124"},
127
- { index = "pytorch-cpu", group = "cpu"},
117
+ { index = "pytorch-default", marker = "sys_platform != 'linux'" },
118
+ { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
128
119
  ]
129
120
 
130
121
  [[tool.uv.index]]
@@ -133,8 +124,8 @@ url = "https://download.pytorch.org/whl/cu124"
133
124
  explicit = true
134
125
 
135
126
  [[tool.uv.index]]
136
- name = "pytorch-cpu"
137
- url = "https://download.pytorch.org/whl/cpu"
127
+ name = "pytorch-default"
128
+ url = "https://pypi.org/simple"
138
129
  explicit = true
139
130
 
140
131
  [tool.uv.extra-build-dependencies]
@@ -152,6 +143,12 @@ select = [
152
143
  "UP", # Auto-upgrading of new Python features
153
144
  "I", # Sort imports
154
145
  ]
146
+ [tool.ruff.lint.isort]
147
+ # https://github.com/astral-sh/ruff-pre-commit/issues/121
148
+ # https://github.com/astral-sh/ruff/issues/10519
149
+ # wandb creates a folder called 'wandb' during local runs (not logged in)
150
+ # this needs to be added to prevent isort from incorrectly sorting
151
+ known-third-party = ["wandb"]
155
152
 
156
153
  [tool.ruff.lint.extend-per-file-ignores]
157
154
  "__init__.py" = ["F401"]
@@ -8,7 +8,8 @@ from determined.core._context import init as determined_core_init
8
8
  from determined.core._distributed import DummyDistributedContext
9
9
  from pydantic import AfterValidator, BaseModel, ConfigDict
10
10
 
11
- from eval_framework.context.eval import EvalContext, import_models
11
+ from eval_framework.context.eval import EvalContext
12
+ from eval_framework.context.local import _load_model
12
13
  from eval_framework.llm.base import BaseLLM
13
14
  from eval_framework.tasks.eval_config import EvalConfig
14
15
  from eval_framework.tasks.perturbation import PerturbationConfig
@@ -111,18 +112,16 @@ class DeterminedContext(EvalContext):
111
112
  if val_cli and val_hparams and val_cli != val_hparams:
112
113
  logger.info(f"CLI argument {name} ({val_cli}) is being overridden by hyperparameters: ({val_hparams}).")
113
114
 
114
- models = import_models(self.models_path)
115
- if self.hparams.llm_name not in models:
116
- raise ValueError(f"LLM '{self.hparams.llm_name}' not found.")
117
- llm_class = models[self.hparams.llm_name]
118
-
119
- llm_judge_class: type[BaseLLM] | None = None
115
+ # Hyperparameters take precedence over core context
116
+ llm_name = self.hparams.llm_name or self.llm_name
120
117
  judge_model_name = self.hparams.task_args.judge_model_name or self.judge_model_name
121
- if self.judge_models_path is not None and judge_model_name is not None:
122
- judge_models = import_models(self.judge_models_path)
123
- if judge_model_name not in judge_models:
124
- raise ValueError(f"LLM judge '{judge_model_name}' not found.")
125
- llm_judge_class = judge_models[judge_model_name]
118
+
119
+ llm_class = _load_model(llm_name, models_path=self.models_path)
120
+ llm_judge_class: type[BaseLLM] | None = (
121
+ _load_model(judge_model_name, models_path=self.judge_models_path, info="judge")
122
+ if judge_model_name
123
+ else None
124
+ )
126
125
 
127
126
  # for all optional hyperparameters, resort to the respective CLI argument if the hyperparameter is not set
128
127
  self.config = EvalConfig(
@@ -2,6 +2,7 @@ import importlib.util
2
2
  import inspect
3
3
  import sys
4
4
  from contextlib import AbstractContextManager
5
+ from os import PathLike
5
6
  from pathlib import Path
6
7
  from typing import Any
7
8
 
@@ -11,7 +12,7 @@ from eval_framework.tasks.eval_config import EvalConfig
11
12
  from eval_framework.tasks.perturbation import PerturbationConfig
12
13
 
13
14
 
14
- def import_models(models_file: Path | str) -> dict[str, type[BaseLLM]]:
15
+ def import_models(models_file: PathLike | str) -> dict[str, type[BaseLLM]]:
15
16
  models_file = Path(models_file).resolve()
16
17
  library_path = Path(eval_framework.__path__[0]).resolve()
17
18
 
@@ -86,10 +87,10 @@ class EvalContext(AbstractContextManager):
86
87
  self.wandb_run_id = wandb_run_id
87
88
  self.hf_upload_dir = hf_upload_dir
88
89
  self.hf_upload_repo = hf_upload_repo
89
- self.llm_args = llm_args
90
+ self.llm_args = llm_args if llm_args is not None else {}
90
91
  self.judge_models_path = judge_models_path
91
92
  self.judge_model_name = judge_model_name
92
- self.judge_model_args = judge_model_args
93
+ self.judge_model_args = judge_model_args if judge_model_args is not None else {}
93
94
  self.batch_size = batch_size
94
95
  self.description = description
95
96
 
@@ -0,0 +1,75 @@
1
+ import importlib
2
+ from os import PathLike
3
+ from typing import Any
4
+
5
+ from eval_framework.context.eval import EvalContext, import_models
6
+ from eval_framework.llm.base import BaseLLM
7
+ from eval_framework.tasks.eval_config import EvalConfig
8
+
9
+
10
def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str = "") -> type[BaseLLM]:
    """Load a model class either from a models file or as a fully qualified module path.

    Args:
        llm_name: The name of the model class to load, or a fully qualified module path
            (e.g. "pkg.module.ClassName").
        models_path: The path to a Python file containing model class definitions.
        info: Additional info (e.g. "judge") to include in error messages.
    Returns:
        The model class.
    Raises:
        ValueError: If the model class cannot be resolved.
    """
    # Build the error-message prefix once; strip() avoids a double space when info is empty.
    label = f"LLM {info}".strip()
    if models_path is None or "." in llm_name:
        # The llm_name must be a fully qualified module path.
        if "." not in llm_name:
            raise ValueError(f"{label} '{llm_name}' is not a fully qualified module path.")
        module_path, llm_class_name = llm_name.rsplit(".", 1)
        module = importlib.import_module(module_path)
        if not hasattr(module, llm_class_name):
            raise ValueError(f"LLM '{llm_class_name}' not found in module '{module_path}'.")
        return getattr(module, llm_class_name)
    else:
        # Look the class name up among the definitions in the models file.
        models_dict = import_models(models_path)
        if llm_name not in models_dict:
            raise ValueError(f"{label} '{llm_name}' not found in {models_path}.")
        return models_dict[llm_name]
36
+
37
+
38
class LocalContext(EvalContext):
    """Evaluation context for local runs.

    On entry, resolves the model (and optional judge model) classes and assembles
    the EvalConfig from the attributes populated by EvalContext.
    """

    def __enter__(self) -> "LocalContext":
        # Resolve the main model class from the models file or a fully qualified path.
        main_llm_class = _load_model(self.llm_name, models_path=self.models_path)

        # The judge model is optional; resolve it only when a name was provided.
        self.llm_judge_class: type[BaseLLM] | None = (
            _load_model(self.judge_model_name, models_path=self.judge_models_path, info="judge")
            if self.judge_model_name is not None
            else None
        )

        self.config = EvalConfig(
            llm_class=main_llm_class,
            llm_args=self.llm_args,
            num_samples=self.num_samples,
            max_tokens=self.max_tokens,
            num_fewshot=self.num_fewshot,
            perturbation_config=self.perturbation_config,
            task_name=self.task_name,
            task_subjects=self.task_subjects,
            hf_revision=self.hf_revision,
            output_dir=self.output_dir,
            hf_upload_dir=self.hf_upload_dir,
            hf_upload_repo=self.hf_upload_repo,
            wandb_entity=self.wandb_entity,
            wandb_project=self.wandb_project,
            wandb_run_id=self.wandb_run_id,
            llm_judge_class=self.llm_judge_class,
            judge_model_args=self.judge_model_args,
            batch_size=self.batch_size,
            description=self.description,
        )

        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: Any | None,
    ) -> None:
        # Local runs hold no resources that need releasing.
        pass