eval-framework 0.2.10__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.10 → eval_framework-0.2.12}/PKG-INFO +5 -5
  2. {eval_framework-0.2.10 → eval_framework-0.2.12}/pyproject.toml +5 -5
  3. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/aleph_alpha.py +14 -70
  4. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/base.py +8 -6
  5. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/huggingface.py +12 -13
  6. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/vllm.py +24 -23
  7. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/base.py +0 -2
  8. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/duc.py +11 -7
  9. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py +1 -2
  10. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -0
  11. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py +1 -4
  12. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_de.py +1 -0
  13. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +10 -0
  14. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/piqa.py +1 -0
  15. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/squad.py +0 -1
  16. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py +32 -0
  17. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -0
  18. {eval_framework-0.2.10 → eval_framework-0.2.12}/LICENSE +0 -0
  19. {eval_framework-0.2.10 → eval_framework-0.2.12}/README.md +0 -0
  20. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/__init__.py +0 -0
  21. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/base_config.py +0 -0
  22. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/__init__.py +0 -0
  23. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/determined.py +0 -0
  24. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/eval.py +0 -0
  25. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/local.py +0 -0
  26. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/evaluation_generator.py +0 -0
  27. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/exceptions.py +0 -0
  28. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  29. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  30. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  31. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  32. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  33. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/__init__.py +0 -0
  34. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/mistral.py +0 -0
  35. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/models.py +0 -0
  36. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/openai.py +0 -0
  37. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/logger.py +0 -0
  38. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/main.py +0 -0
  39. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/__init__.py +0 -0
  40. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/base.py +0 -0
  41. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/__init__.py +0 -0
  42. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  43. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  44. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/bleu.py +0 -0
  45. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/chrf.py +0 -0
  46. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  47. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  48. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/comet.py +0 -0
  49. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  50. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  51. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  52. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  53. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/f1.py +0 -0
  54. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  55. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  56. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  57. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/json_format.py +0 -0
  58. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  59. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/length_control.py +0 -0
  60. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  61. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  62. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  63. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/repetition.py +0 -0
  64. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  65. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  66. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  67. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  68. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  69. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ter.py +0 -0
  70. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  71. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  72. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  73. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/__init__.py +0 -0
  74. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/base.py +0 -0
  75. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  76. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  77. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  78. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  79. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  80. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  81. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  82. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  83. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  84. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  85. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  86. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  87. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  88. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  89. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  90. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  91. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  92. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  93. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  94. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  95. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  96. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  97. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  98. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  99. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  100. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/utils.py +0 -0
  101. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  102. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  103. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  104. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  105. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  106. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  107. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  108. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/py.typed +0 -0
  109. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/response_generator.py +0 -0
  110. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/__init__.py +0 -0
  111. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/base.py +0 -0
  112. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  113. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/result_processor.py +0 -0
  114. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  115. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/run.py +0 -0
  116. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/run_direct.py +0 -0
  117. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/shared/types.py +0 -0
  118. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  120. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  121. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  122. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  123. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  124. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  125. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  126. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  127. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  128. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  129. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  130. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  131. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  132. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  133. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  134. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  135. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  136. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  137. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  138. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  139. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  140. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  141. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  142. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  143. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  144. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  145. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  146. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  147. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  148. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  149. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  150. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  151. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  152. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  153. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/eval_config.py +0 -0
  154. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/perturbation.py +0 -0
  155. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/registry.py +0 -0
  156. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/task_loader.py +0 -0
  157. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/task_names.py +0 -0
  158. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/utils.py +0 -0
  159. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/constants.py +0 -0
  160. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/file_ops.py +0 -0
  161. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/generate_task_docs.py +0 -0
  162. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/helpers.py +0 -0
  163. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/logging.py +0 -0
  164. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/packaging.py +0 -0
  165. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/tqdm_handler.py +0 -0
  166. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.10
3
+ Version: 0.2.12
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -214,7 +214,7 @@ Classifier: Typing :: Typed
214
214
  Requires-Dist: pyyaml>=6.0.1,<7
215
215
  Requires-Dist: xmltodict>=0.13.0,<0.16
216
216
  Requires-Dist: pydantic>=2.7,<3
217
- Requires-Dist: datasets>=2.19.1,<4
217
+ Requires-Dist: datasets>=4.0.0,<5
218
218
  Requires-Dist: sacrebleu>=2.4.3,<3
219
219
  Requires-Dist: pycountry>=24.6.1,<25
220
220
  Requires-Dist: nltk>=3.9.1,<4
@@ -238,15 +238,15 @@ Requires-Dist: numpy>=1.26.4
238
238
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
239
  Requires-Dist: accelerate ; extra == 'accelerate'
240
240
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
241
- Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
241
+ Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
242
242
  Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
243
243
  Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
244
244
  Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
245
245
  Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
246
246
  Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
247
247
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
248
- Requires-Dist: openai>=1.62,<2.8 ; extra == 'openai'
249
- Requires-Dist: tiktoken>=0.9,<0.10 ; extra == 'openai'
248
+ Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
249
+ Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
250
250
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
251
251
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
252
252
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.10"
3
+ version = "0.2.12"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "pyyaml>=6.0.1,<7",
22
22
  "xmltodict>=0.13.0,<0.16",
23
23
  "pydantic>=2.7,<3",
24
- "datasets>=2.19.1,<4", # dataset v4 has breaking changes we'd need to adapt to
24
+ "datasets>=4.0.0,<5",
25
25
  "sacrebleu>=2.4.3,<3",
26
26
  "pycountry>=24.6.1,<25",
27
27
  "nltk>=3.9.1,<4",
@@ -53,10 +53,10 @@ determined = [
53
53
  "determined>=0.38,<0.39",
54
54
  "tensorboard==2.19.0"
55
55
  ]
56
- api = ["aleph-alpha-client>=10,<11"]
56
+ api = ["aleph-alpha-client>=11.5.1"]
57
57
  openai = [
58
- "openai>=1.62,<2.8",
59
- "tiktoken>=0.9,<0.10",
58
+ "openai>=1.62,<3",
59
+ "tiktoken>=0.9,<1",
60
60
  "transformers>=4.45.2,<5",
61
61
  ]
62
62
  transformers = [
@@ -3,16 +3,12 @@ import json
3
3
  import logging
4
4
  import math
5
5
  import os
6
- import random
7
6
  import re
8
- import time
9
7
  import traceback
10
8
  from collections.abc import Callable, Sequence
11
9
 
12
- import aiohttp
13
10
  from aleph_alpha_client import (
14
11
  AsyncClient,
15
- BusyError,
16
12
  Client,
17
13
  CompletionRequest,
18
14
  CompletionResponse,
@@ -49,11 +45,11 @@ class AlephAlphaAPIModel(BaseLLM):
49
45
  formatter: BaseFormatter | None = None,
50
46
  checkpoint_name: str | None = None,
51
47
  temperature: float | None = None,
48
+ top_p: float | None = None,
52
49
  # Please see README.md for tips if adapting the following parameters.
53
50
  max_retries: int = 100,
54
51
  max_async_concurrent_requests: int = 32,
55
52
  request_timeout_seconds: int = 30 * 60 + 5,
56
- queue_full_timeout_seconds: int = 30 * 60 + 5,
57
53
  bytes_per_token: float | None = None,
58
54
  token: str = os.getenv("AA_TOKEN", "dummy"),
59
55
  base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
@@ -67,10 +63,10 @@ class AlephAlphaAPIModel(BaseLLM):
67
63
  self._formatter = formatter
68
64
  self._llm_name = checkpoint_name or self.LLM_NAME
69
65
  self._temperature = temperature if temperature is not None else 0.0
66
+ self._top_p = top_p if top_p is not None else 0.0
70
67
  self.max_async_concurrent_requests = max_async_concurrent_requests
71
68
  self.max_retries = max_retries
72
69
  self.request_timeout_seconds = request_timeout_seconds
73
- self.queue_full_timeout_seconds = queue_full_timeout_seconds
74
70
  self.token = token
75
71
  self.base_url = base_url
76
72
  self._validate_model_availability(base_url, token)
@@ -101,56 +97,6 @@ class AlephAlphaAPIModel(BaseLLM):
101
97
  except Exception as e:
102
98
  raise RuntimeError(f"Model '{self._llm_name}' is not available: {e}")
103
99
 
104
- async def _request_with_backoff(
105
- self, client: AsyncClient, request: CompletionRequest, id: int
106
- ) -> CompletionResponse:
107
- """
108
- Query Aleph-Alpha API with complete. Retry with back-off until it responds.
109
- """
110
- num_attempts = 0
111
- start_time: float | None = None
112
-
113
- while True:
114
- try:
115
- return await client.complete(request, model=self._llm_name)
116
-
117
- except (TimeoutError, BusyError, RuntimeError, aiohttp.ClientError) as e:
118
- status_code: str = safe_json_loads(e.args[1]).get("code", "") if len(e.args) >= 2 else ""
119
- str_e = str(e)
120
- if status_code == "QUEUE_FULL":
121
- # Worker not available or missed a heartbeat (inference longer than scheduler's
122
- # API_MODEL_AVAILABLE_TIMEOUT_DURATION_MILLIS) or the scheduler is overloaded.
123
- if start_time is None:
124
- start_time = time.time()
125
- elapsed = time.time() - start_time
126
- if elapsed <= self.queue_full_timeout_seconds:
127
- logger.info(
128
- f"Request {id}: {status_code or str_e[:256]} - retrying: attempt"
129
- f" {num_attempts}/{self.max_retries}, elapsed {elapsed:.1f} sec"
130
- )
131
- # don't count as retry (request returns immediately, so just wait a bit not to DoS the server)
132
- await asyncio.sleep(random.randint(5, 30))
133
- continue
134
-
135
- elif (
136
- status_code == "TIMEOUT_TASK"
137
- or isinstance(e, TimeoutError)
138
- or "502 Bad Gateway" in str_e
139
- or "504 Gateway Time-out" in str_e
140
- or isinstance(e, aiohttp.ClientError)
141
- ):
142
- # client timeout, either because task too long in a queue or inference too long
143
- # (scheduler's API_CLIENT_TIMEOUT_DURATION_MILLIS). Retrying for the "inference too long"
144
- # case makes no sense but we unfortunately don't know which case has happened.
145
- num_attempts += 1
146
- start_time = None
147
- if num_attempts < self.max_retries:
148
- logger.info(f"Request {id}: TIMEOUT_TASK - retrying: attempt {num_attempts}/{self.max_retries}")
149
- await asyncio.sleep(random.randint(5, 30))
150
- continue
151
-
152
- raise e
153
-
154
100
  def _error_from_exception(self, e: Exception) -> Error:
155
101
  """Convert an exception to an Error object."""
156
102
  if len(e.args) >= 2:
@@ -171,39 +117,36 @@ class AlephAlphaAPIModel(BaseLLM):
171
117
  async def _process_request_with_client(
172
118
  self,
173
119
  client: AsyncClient,
174
- semaphore: asyncio.Semaphore,
175
120
  request: CompletionRequest,
176
121
  id: int,
177
122
  ) -> tuple[CompletionRequest, CompletionResponse | Error]:
178
123
  """Process a single request, returning the request and either a response or error."""
179
- async with semaphore:
180
- try:
181
- response = await self._request_with_backoff(client=client, request=request, id=id)
182
- logger.info(f"Request {id}: Success")
183
- return (request, response)
184
- except Exception as e:
185
- if raise_errors():
186
- raise e
187
- logger.info(f"Request {id}: Failure: {str(e)[:256]}")
188
- return (request, self._error_from_exception(e))
124
+ try:
125
+ response = await client.complete(request, model=self._llm_name)
126
+ logger.info(f"Request {id}: Success")
127
+ return (request, response)
128
+ except Exception as e:
129
+ if raise_errors():
130
+ raise e
131
+ logger.info(f"Request {id}: Failure: {str(e)[:256]}")
132
+ return (request, self._error_from_exception(e))
189
133
 
190
134
  async def _process_requests(
191
135
  self,
192
136
  requests: list[CompletionRequest],
193
137
  ) -> list[tuple[CompletionRequest, CompletionResponse | Error]]:
194
138
  """Process multiple requests concurrently, returning request/response pairs."""
195
- semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
196
139
  async with AsyncClient(
197
140
  host=self.base_url,
198
141
  nice=True,
199
142
  request_timeout_seconds=self.request_timeout_seconds,
200
143
  token=self.token,
201
- total_retries=0, # we have a custom retry policy in _request_with_backoff()
144
+ total_retries=self.max_retries,
145
+ limit=self.max_async_concurrent_requests,
202
146
  ) as client:
203
147
  tasks = (
204
148
  self._process_request_with_client(
205
149
  client,
206
- semaphore,
207
150
  request,
208
151
  i,
209
152
  )
@@ -272,6 +215,7 @@ class AlephAlphaAPIModel(BaseLLM):
272
215
  maximum_tokens=scaled_max_tokens,
273
216
  stop_sequences=stop_sequences,
274
217
  temperature=effective_temperature,
218
+ top_p=self._top_p,
275
219
  )
276
220
  )
277
221
 
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
- from collections.abc import Sequence
2
+ from collections.abc import Generator, Sequence
3
+ from contextlib import contextmanager
3
4
  from pathlib import Path
4
5
  from typing import Any
5
6
 
@@ -112,21 +113,22 @@ class BaseLLM(ABC):
112
113
  """
113
114
  pass
114
115
 
116
+ @contextmanager
115
117
  def _get_final_checkpoint(
116
118
  self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
117
- ) -> tuple[str | Path | None, str | None]:
119
+ ) -> Generator[tuple[str | Path | None, str | None], None, None]:
118
120
  if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
119
121
  if not getattr(self, "LLM_NAME", ""):
120
122
  raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
121
- return None, None # no argument given, so will use the LLM_NAME of the class
123
+ yield None, None # no argument given, so will use the LLM_NAME of the class
122
124
  elif num_provided > 1:
123
125
  raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
124
126
 
125
127
  elif checkpoint_path is not None:
126
- return checkpoint_path, str(checkpoint_path)
128
+ yield checkpoint_path, str(checkpoint_path)
127
129
 
128
130
  elif model_name is not None:
129
- return model_name, model_name
131
+ yield model_name, model_name
130
132
 
131
133
  else:
132
134
  from eval_framework.utils.file_ops import WandbFs
@@ -139,7 +141,7 @@ class BaseLLM(ABC):
139
141
  file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
140
142
  if file_root is None:
141
143
  raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
142
- return file_root, artifact_name
144
+ yield file_root, artifact_name
143
145
 
144
146
  def _get_final_formatter(
145
147
  self,
@@ -322,22 +322,21 @@ class HFLLM(BaseHFLLM):
322
322
  bytes_per_token: float | None = None,
323
323
  **kwargs: Any,
324
324
  ) -> None:
325
- final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
325
+ with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
326
+ self.checkpoint_name = checkpoint_name
327
+ if self.checkpoint_name is None and possible_name is not None:
328
+ self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
326
329
 
327
- self.checkpoint_name = checkpoint_name
328
- if self.checkpoint_name is None and possible_name is not None:
329
- self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
330
+ if final_path:
331
+ self.LLM_NAME = str(final_path)
330
332
 
331
- if final_path:
332
- self.LLM_NAME = str(final_path)
333
+ final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
333
334
 
334
- final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
335
-
336
- super().__init__(
337
- formatter=final_formatter,
338
- bytes_per_token=bytes_per_token,
339
- **kwargs,
340
- )
335
+ super().__init__(
336
+ formatter=final_formatter,
337
+ bytes_per_token=bytes_per_token,
338
+ **kwargs,
339
+ )
341
340
 
342
341
  @property
343
342
  def name(self) -> str:
@@ -137,10 +137,12 @@ class BaseVLLMModel(BaseLLM):
137
137
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
138
138
 
139
139
  self.batch_size = batch_size
140
- self._tokenizer: None | VLLMTokenizerAPI = None
141
140
 
142
141
  self.model = LLM(**model_args, device=device)
143
142
 
143
+ self._tokenizer: None | VLLMTokenizerAPI = None
144
+ _ = self.tokenizer # make sure tokenizer is initialized
145
+
144
146
  self.sampling_params: SamplingParams = self._process_sampling_params(sampling_params)
145
147
 
146
148
  logger.info(
@@ -481,28 +483,27 @@ class VLLMModel(BaseVLLMModel):
481
483
  sampling_params: SamplingParams | dict[str, Any] | None = None,
482
484
  **kwargs: Any,
483
485
  ) -> None:
484
- final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
485
-
486
- if final_path:
487
- self.LLM_NAME = str(final_path)
488
-
489
- final_name = checkpoint_name
490
- if final_name is None and possible_name is not None:
491
- final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
492
-
493
- final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
494
-
495
- super().__init__(
496
- formatter=final_formatter,
497
- checkpoint_path=final_path,
498
- checkpoint_name=final_name,
499
- max_model_len=max_model_len,
500
- tensor_parallel_size=tensor_parallel_size,
501
- gpu_memory_utilization=gpu_memory_utilization,
502
- batch_size=batch_size,
503
- sampling_params=sampling_params,
504
- **kwargs,
505
- )
486
+ with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
487
+ if final_path:
488
+ self.LLM_NAME = str(final_path)
489
+
490
+ final_name = checkpoint_name
491
+ if final_name is None and possible_name is not None:
492
+ final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
493
+
494
+ final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
495
+
496
+ super().__init__(
497
+ formatter=final_formatter,
498
+ checkpoint_path=final_path,
499
+ checkpoint_name=final_name,
500
+ max_model_len=max_model_len,
501
+ tensor_parallel_size=tensor_parallel_size,
502
+ gpu_memory_utilization=gpu_memory_utilization,
503
+ batch_size=batch_size,
504
+ sampling_params=sampling_params,
505
+ **kwargs,
506
+ )
506
507
 
507
508
 
508
509
  class VLLMRegistryModel(VLLMModel): # deprecated
@@ -171,7 +171,6 @@ class BaseTask[SubjectType](ABC):
171
171
  return load_dataset(
172
172
  **kwargs,
173
173
  revision=self.HF_REVISION,
174
- trust_remote_code=True,
175
174
  cache_dir=cache_dir,
176
175
  download_config=download_config,
177
176
  )
@@ -179,7 +178,6 @@ class BaseTask[SubjectType](ABC):
179
178
  return load_dataset(
180
179
  **kwargs,
181
180
  revision=self.HF_REVISION,
182
- trust_remote_code=True,
183
181
  cache_dir=f"{Path.home()}/.cache/eval-framework",
184
182
  )
185
183
 
@@ -12,11 +12,12 @@ class DUC(BaseTask[str], ABC):
12
12
  """https://huggingface.co/datasets/midas/duc2001"""
13
13
 
14
14
  DATASET_PATH: str = "midas/duc2001"
15
- SAMPLE_SPLIT: str = "test"
16
- FEWSHOT_SPLIT: str = "test"
15
+ HF_REVISION: str = "77d6dedcbce421695a12f24c8802e8847a129d92"
16
+ SAMPLE_SPLIT: str = "train"
17
+ FEWSHOT_SPLIT: str = "train"
17
18
  RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
18
19
  METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
19
- SUBJECTS: list[str] = ["raw"]
20
+ SUBJECTS: list[str] = ["default"]
20
21
  PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
21
22
  LANGUAGE = Language.ENG
22
23
 
@@ -33,6 +34,10 @@ class DUC(BaseTask[str], ABC):
33
34
  completion_text = completion_text.strip()
34
35
  return completion_text
35
36
 
37
+ def _load_dataset(self, subject: str) -> None:
38
+ hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
39
+ self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
40
+
36
41
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
37
42
  instruction_text = " ".join(item["document"])
38
43
  instruction_text = re.sub(r"\s+([.,!?;:])", r"\1", instruction_text)
@@ -47,7 +52,7 @@ class DUC(BaseTask[str], ABC):
47
52
 
48
53
  class DUC_EXTRACTIVE(DUC):
49
54
  NAME = "DUC Extractive"
50
- SUBJECTS: list[str] = ["raw"]
55
+ SUBJECTS: list[str] = ["default"]
51
56
 
52
57
  def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
53
58
  return item["extractive_keyphrases"]
@@ -61,14 +66,13 @@ class DUC_EXTRACTIVE(DUC):
61
66
 
62
67
  class DUC_ABSTRACTIVE(DUC):
63
68
  NAME = "DUC Abstractive"
64
- SUBJECTS: list[str] = ["raw"]
69
+ SUBJECTS: list[str] = ["default"]
65
70
 
66
71
  def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
67
72
  return item["abstractive_keyphrases"]
68
73
 
69
74
  def _load_dataset(self, subject: str) -> None:
70
- # not all samples have abstractive keyphrases
71
- hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
75
+ hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
72
76
  self.dataset = {}
73
77
 
74
78
  for split, data in hf_dataset.items():
@@ -25,6 +25,7 @@ class Flores200(BaseTask[str]):
25
25
 
26
26
  NAME = "FLoRes-200"
27
27
  DATASET_PATH = "facebook/flores"
28
+ HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
28
29
  SAMPLE_SPLIT = "devtest"
29
30
  FEWSHOT_SPLIT = "dev"
30
31
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -66,7 +67,6 @@ class Flores200(BaseTask[str]):
66
67
  split=kwargs.get("split"),
67
68
  data_files=None, # Let it auto-discover parquet files
68
69
  revision=self.HF_REVISION,
69
- trust_remote_code=False, # Disable the loading script!
70
70
  cache_dir=cache_dir,
71
71
  download_config=download_config,
72
72
  )
@@ -79,7 +79,6 @@ class Flores200(BaseTask[str]):
79
79
  dataset = load_dataset(
80
80
  **kwargs,
81
81
  revision=self.HF_REVISION,
82
- trust_remote_code=True,
83
82
  cache_dir=cache_dir,
84
83
  download_config=download_config,
85
84
  )
@@ -92,6 +92,7 @@ class GSM8KEvalHarness(BaseTask[str]):
92
92
 
93
93
  NAME = "GSM8KEvalHarness"
94
94
  DATASET_PATH = "openai/gsm8k"
95
+ HF_REVISION = "main"
95
96
  SAMPLE_SPLIT = "test"
96
97
  FEWSHOT_SPLIT = "train"
97
98
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -39,13 +39,10 @@ class InfiniteBench(BaseTask[str], ABC):
39
39
  }
40
40
  )
41
41
  try:
42
- return load_dataset(
43
- **kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
44
- )
42
+ return load_dataset(**kwargs, cache_dir=cache_dir, download_config=download_config, features=ft)
45
43
  except Exception:
46
44
  return load_dataset(
47
45
  **kwargs,
48
- trust_remote_code=True,
49
46
  cache_dir=f"{Path.home()}/.cache/eval-framework",
50
47
  features=ft,
51
48
  )
@@ -73,6 +73,7 @@ class MMLU_DE(BaseTask[str]):
73
73
 
74
74
  NAME = "MMLU_DE"
75
75
  DATASET_PATH = "LeoLM/MMLU_de"
76
+ HF_REVISION = "11433b408001dd26444c7e666cc536e0b8907ca5"
76
77
  SAMPLE_SPLIT = "test"
77
78
  FEWSHOT_SPLIT = "validation"
78
79
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -27,6 +27,7 @@ class ARC_EU20_DE(ARC):
27
27
 
28
28
  NAME = "ARC_EU20_DE"
29
29
  DATASET_PATH = "openGPT-X/arcx"
30
+ HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
30
31
  SAMPLE_SPLIT = "test"
31
32
  FEWSHOT_SPLIT = "train"
32
33
  SUBJECTS = ["challenge_DE", "easy_DE"]
@@ -36,6 +37,7 @@ class ARC_EU20_DE(ARC):
36
37
  class ARC_EU20_FR(ARC):
37
38
  NAME = "ARC_EU20_FR"
38
39
  DATASET_PATH = "openGPT-X/arcx"
40
+ HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
39
41
  SAMPLE_SPLIT = "test"
40
42
  FEWSHOT_SPLIT = "train"
41
43
  SUBJECTS = ["challenge_FR", "easy_FR"]
@@ -51,6 +53,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
51
53
  """ # noqa: E501
52
54
 
53
55
  NAME = "GSM8K_EU20_DE"
56
+ HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
54
57
  DATASET_PATH = "openGPT-X/gsm8kx"
55
58
  SAMPLE_SPLIT = "test"
56
59
  FEWSHOT_SPLIT = "train"
@@ -60,6 +63,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
60
63
 
61
64
  class GSM8K_EU20_FR(GSM8KEvalHarness):
62
65
  NAME = "GSM8K_EU20_FR"
66
+ HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
63
67
  DATASET_PATH = "openGPT-X/gsm8kx"
64
68
  SAMPLE_SPLIT = "test"
65
69
  FEWSHOT_SPLIT = "train"
@@ -77,6 +81,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
77
81
 
78
82
  NAME = "HellaSwag_EU20_DE"
79
83
  DATASET_PATH = "openGPT-X/hellaswagx"
84
+ HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
80
85
  SAMPLE_SPLIT = "train"
81
86
  FEWSHOT_SPLIT = "validation"
82
87
  SUBJECTS = ["DE"]
@@ -86,6 +91,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
86
91
  class HELLASWAG_EU20_FR(HELLASWAG):
87
92
  NAME = "HellaSwag_EU20_FR"
88
93
  DATASET_PATH = "openGPT-X/hellaswagx"
94
+ HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
89
95
  SAMPLE_SPLIT = "train"
90
96
  FEWSHOT_SPLIT = "validation"
91
97
  SUBJECTS = ["FR"]
@@ -128,6 +134,7 @@ class TRUTHFULQA_EU20_DE(TRUTHFULQA):
128
134
 
129
135
  NAME = "TruthfulQA_EU20_DE"
130
136
  DATASET_PATH = "openGPT-X/truthfulqax"
137
+ HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
131
138
  LANGUAGE = Language.DEU
132
139
 
133
140
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
@@ -182,6 +189,7 @@ TRUTHFULQA_EU20_FR_FEWSHOT_ITEMS = [
182
189
  class TRUTHFULQA_EU20_FR(TRUTHFULQA):
183
190
  NAME = "TruthfulQA_EU20_FR"
184
191
  DATASET_PATH = "openGPT-X/truthfulqax"
192
+ HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
185
193
  LANGUAGE = Language.FRA
186
194
 
187
195
  def _load_dataset(self, subject: SubjectType) -> None:
@@ -214,6 +222,7 @@ class MMLU_EU20_DE(MMLU):
214
222
 
215
223
  NAME = "MMLU_EU20_DE"
216
224
  DATASET_PATH = "openGPT-X/mmlux"
225
+ HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
217
226
  SAMPLE_SPLIT = "test"
218
227
  FEWSHOT_SPLIT = "dev" # one could merge dev and validation to have a larger pool of fewshot examples
219
228
  SUBJECTS = [i + "_DE" for i in MMLU_SUBJECTS]
@@ -321,6 +330,7 @@ MMLU_SUBJECTS_TRANSLATION_FR = {
321
330
  class MMLU_EU20_FR(MMLU):
322
331
  NAME = "MMLU_EU20_FR"
323
332
  DATASET_PATH = "openGPT-X/mmlux"
333
+ HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
324
334
  SAMPLE_SPLIT = "test"
325
335
  FEWSHOT_SPLIT = "dev"
326
336
  SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]
@@ -15,6 +15,7 @@ class PIQA(BaseTask[str]):
15
15
 
16
16
  NAME = "PIQA"
17
17
  DATASET_PATH = "ybisk/piqa"
18
+ HF_REVISION = "6b3aceb3276e5ab7e51895d73151a718690af38c"
18
19
  SAMPLE_SPLIT = "validation" # 1838 examples (same split as lm-eval)
19
20
  FEWSHOT_SPLIT = "test" # 3084 examples
20
21
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -75,7 +75,6 @@ class SQUAD2(BaseTask[str]):
75
75
  return load_dataset(
76
76
  **kwargs,
77
77
  revision=self.HF_REVISION,
78
- trust_remote_code=True,
79
78
  cache_dir=cache_dir,
80
79
  download_config=download_config,
81
80
  )
@@ -1,5 +1,11 @@
1
+ import os
2
+ from pathlib import Path
1
3
  from typing import Any
2
4
 
5
+ from datasets import DownloadConfig, load_dataset
6
+ from huggingface_hub import HfApi
7
+ from huggingface_hub.errors import RevisionNotFoundError
8
+
3
9
  from eval_framework.tasks.base import Language
4
10
  from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
5
11
 
@@ -19,6 +25,7 @@ class WINOX(WINOGRANDE):
19
25
  """
20
26
 
21
27
  DATASET_PATH = "demelin/wino_x"
28
+ HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
22
29
  SAMPLE_SPLIT = "test"
23
30
  FEWSHOT_SPLIT = "test"
24
31
  LANGUAGE_SHORT_CODE = ""
@@ -42,6 +49,31 @@ class WINOX(WINOGRANDE):
42
49
  ]
43
50
  return choices
44
51
 
52
+ def _load_hf_dataset(self, **kwargs: Any) -> Any:
53
+ """Override to verify HF_REVISION exists, then load Wino-X from auto-discovered parquet files."""
54
+ # Check if the HF_REVISION is valid before loading the dataset
55
+ if self.HF_REVISION:
56
+ try:
57
+ _ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
58
+ except Exception as e:
59
+ if isinstance(e, RevisionNotFoundError):
60
+ raise e
61
+
62
+ cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
63
+ download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
64
+
65
+ dataset = load_dataset(
66
+ kwargs.get("path", self.DATASET_PATH),
67
+ name=kwargs.get("name"),
68
+ split=kwargs.get("split"),
69
+ data_files=None, # Let it auto-discover parquet files
70
+ revision=self.HF_REVISION,
71
+ cache_dir=cache_dir,
72
+ download_config=download_config,
73
+ )
74
+
75
+ return dataset
76
+
45
77
 
46
78
  class WINOX_DE(WINOX):
47
79
  NAME = "WINOX_DE"
@@ -16,6 +16,7 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]):
16
16
 
17
17
  NAME = "ZeroSCROLLS QuALITY"
18
18
  DATASET_PATH = "tau/zero_scrolls"
19
+ HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
19
20
  SAMPLE_SPLIT = "validation"
20
21
  FEWSHOT_SPLIT = "validation"
21
22
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -48,6 +49,7 @@ class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
48
49
  """ZeroSCROLLS dataset: https://huggingface.co/datasets/tau/zero_scrolls"""
49
50
 
50
51
  DATASET_PATH = "tau/zero_scrolls"
52
+ HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
51
53
  SAMPLE_SPLIT = "validation"
52
54
  FEWSHOT_SPLIT = "validation"
53
55
  RESPONSE_TYPE = ResponseType.COMPLETION