eval-framework 0.2.11__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.11 → eval_framework-0.2.12}/PKG-INFO +4 -4
  2. {eval_framework-0.2.11 → eval_framework-0.2.12}/pyproject.toml +4 -4
  3. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/aleph_alpha.py +3 -0
  4. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/base.py +0 -2
  5. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/duc.py +11 -7
  6. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py +1 -2
  7. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -0
  8. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py +1 -4
  9. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_de.py +1 -0
  10. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +10 -0
  11. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/piqa.py +1 -0
  12. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/squad.py +0 -1
  13. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py +32 -0
  14. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -0
  15. {eval_framework-0.2.11 → eval_framework-0.2.12}/LICENSE +0 -0
  16. {eval_framework-0.2.11 → eval_framework-0.2.12}/README.md +0 -0
  17. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/__init__.py +0 -0
  18. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/base_config.py +0 -0
  19. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/__init__.py +0 -0
  20. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/determined.py +0 -0
  21. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/eval.py +0 -0
  22. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/local.py +0 -0
  23. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/evaluation_generator.py +0 -0
  24. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/exceptions.py +0 -0
  25. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  26. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  27. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  28. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  29. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  30. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/__init__.py +0 -0
  31. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/base.py +0 -0
  32. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/huggingface.py +0 -0
  33. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/mistral.py +0 -0
  34. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/models.py +0 -0
  35. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/openai.py +0 -0
  36. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/vllm.py +0 -0
  37. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/logger.py +0 -0
  38. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/main.py +0 -0
  39. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/__init__.py +0 -0
  40. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/base.py +0 -0
  41. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/__init__.py +0 -0
  42. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  43. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  44. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/bleu.py +0 -0
  45. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/chrf.py +0 -0
  46. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  47. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  48. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/comet.py +0 -0
  49. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  50. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  51. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  52. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  53. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/f1.py +0 -0
  54. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  55. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  56. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  57. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/json_format.py +0 -0
  58. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  59. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/length_control.py +0 -0
  60. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  61. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  62. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  63. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/repetition.py +0 -0
  64. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  65. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  66. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  67. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  68. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  69. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ter.py +0 -0
  70. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  71. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  72. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  73. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/__init__.py +0 -0
  74. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/base.py +0 -0
  75. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  76. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  77. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  78. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  79. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  80. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  81. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  82. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  83. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  84. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  85. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  86. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  87. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  88. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  89. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  90. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  91. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  92. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  93. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  94. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  95. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  96. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  97. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  98. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  99. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  100. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/utils.py +0 -0
  101. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  102. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  103. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  104. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  105. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  106. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  107. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  108. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/py.typed +0 -0
  109. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/response_generator.py +0 -0
  110. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/__init__.py +0 -0
  111. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/base.py +0 -0
  112. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  113. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/result_processor.py +0 -0
  114. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  115. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/run.py +0 -0
  116. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/run_direct.py +0 -0
  117. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/shared/types.py +0 -0
  118. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  120. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  121. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  122. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  123. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  124. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  125. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  126. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  127. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  128. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  129. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  130. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  131. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  132. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  133. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  134. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  135. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  136. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  137. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  138. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  139. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  140. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  141. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  142. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  143. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  144. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  145. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  146. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  147. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  148. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  149. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  150. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  151. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  152. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  153. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/eval_config.py +0 -0
  154. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/perturbation.py +0 -0
  155. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/registry.py +0 -0
  156. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/task_loader.py +0 -0
  157. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/task_names.py +0 -0
  158. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/utils.py +0 -0
  159. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/constants.py +0 -0
  160. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/file_ops.py +0 -0
  161. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/generate_task_docs.py +0 -0
  162. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/helpers.py +0 -0
  163. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/logging.py +0 -0
  164. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/packaging.py +0 -0
  165. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/tqdm_handler.py +0 -0
  166. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.11
3
+ Version: 0.2.12
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -214,7 +214,7 @@ Classifier: Typing :: Typed
214
214
  Requires-Dist: pyyaml>=6.0.1,<7
215
215
  Requires-Dist: xmltodict>=0.13.0,<0.16
216
216
  Requires-Dist: pydantic>=2.7,<3
217
- Requires-Dist: datasets>=2.19.1,<4
217
+ Requires-Dist: datasets>=4.0.0,<5
218
218
  Requires-Dist: sacrebleu>=2.4.3,<3
219
219
  Requires-Dist: pycountry>=24.6.1,<25
220
220
  Requires-Dist: nltk>=3.9.1,<4
@@ -245,8 +245,8 @@ Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
245
245
  Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
246
246
  Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
247
247
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
248
- Requires-Dist: openai>=1.62,<2.8 ; extra == 'openai'
249
- Requires-Dist: tiktoken>=0.9,<0.10 ; extra == 'openai'
248
+ Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
249
+ Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
250
250
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
251
251
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
252
252
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.11"
3
+ version = "0.2.12"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "pyyaml>=6.0.1,<7",
22
22
  "xmltodict>=0.13.0,<0.16",
23
23
  "pydantic>=2.7,<3",
24
- "datasets>=2.19.1,<4", # dataset v4 has breaking changes we'd need to adapt to
24
+ "datasets>=4.0.0,<5",
25
25
  "sacrebleu>=2.4.3,<3",
26
26
  "pycountry>=24.6.1,<25",
27
27
  "nltk>=3.9.1,<4",
@@ -55,8 +55,8 @@ determined = [
55
55
  ]
56
56
  api = ["aleph-alpha-client>=11.5.1"]
57
57
  openai = [
58
- "openai>=1.62,<2.8",
59
- "tiktoken>=0.9,<0.10",
58
+ "openai>=1.62,<3",
59
+ "tiktoken>=0.9,<1",
60
60
  "transformers>=4.45.2,<5",
61
61
  ]
62
62
  transformers = [
@@ -45,6 +45,7 @@ class AlephAlphaAPIModel(BaseLLM):
45
45
  formatter: BaseFormatter | None = None,
46
46
  checkpoint_name: str | None = None,
47
47
  temperature: float | None = None,
48
+ top_p: float | None = None,
48
49
  # Please see README.md for tips if adapting the following parameters.
49
50
  max_retries: int = 100,
50
51
  max_async_concurrent_requests: int = 32,
@@ -62,6 +63,7 @@ class AlephAlphaAPIModel(BaseLLM):
62
63
  self._formatter = formatter
63
64
  self._llm_name = checkpoint_name or self.LLM_NAME
64
65
  self._temperature = temperature if temperature is not None else 0.0
66
+ self._top_p = top_p if top_p is not None else 0.0
65
67
  self.max_async_concurrent_requests = max_async_concurrent_requests
66
68
  self.max_retries = max_retries
67
69
  self.request_timeout_seconds = request_timeout_seconds
@@ -213,6 +215,7 @@ class AlephAlphaAPIModel(BaseLLM):
213
215
  maximum_tokens=scaled_max_tokens,
214
216
  stop_sequences=stop_sequences,
215
217
  temperature=effective_temperature,
218
+ top_p=self._top_p,
216
219
  )
217
220
  )
218
221
 
@@ -171,7 +171,6 @@ class BaseTask[SubjectType](ABC):
171
171
  return load_dataset(
172
172
  **kwargs,
173
173
  revision=self.HF_REVISION,
174
- trust_remote_code=True,
175
174
  cache_dir=cache_dir,
176
175
  download_config=download_config,
177
176
  )
@@ -179,7 +178,6 @@ class BaseTask[SubjectType](ABC):
179
178
  return load_dataset(
180
179
  **kwargs,
181
180
  revision=self.HF_REVISION,
182
- trust_remote_code=True,
183
181
  cache_dir=f"{Path.home()}/.cache/eval-framework",
184
182
  )
185
183
 
@@ -12,11 +12,12 @@ class DUC(BaseTask[str], ABC):
12
12
  """https://huggingface.co/datasets/midas/duc2001"""
13
13
 
14
14
  DATASET_PATH: str = "midas/duc2001"
15
- SAMPLE_SPLIT: str = "test"
16
- FEWSHOT_SPLIT: str = "test"
15
+ HF_REVISION: str = "77d6dedcbce421695a12f24c8802e8847a129d92"
16
+ SAMPLE_SPLIT: str = "train"
17
+ FEWSHOT_SPLIT: str = "train"
17
18
  RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
18
19
  METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
19
- SUBJECTS: list[str] = ["raw"]
20
+ SUBJECTS: list[str] = ["default"]
20
21
  PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
21
22
  LANGUAGE = Language.ENG
22
23
 
@@ -33,6 +34,10 @@ class DUC(BaseTask[str], ABC):
33
34
  completion_text = completion_text.strip()
34
35
  return completion_text
35
36
 
37
+ def _load_dataset(self, subject: str) -> None:
38
+ hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
39
+ self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
40
+
36
41
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
37
42
  instruction_text = " ".join(item["document"])
38
43
  instruction_text = re.sub(r"\s+([.,!?;:])", r"\1", instruction_text)
@@ -47,7 +52,7 @@ class DUC(BaseTask[str], ABC):
47
52
 
48
53
  class DUC_EXTRACTIVE(DUC):
49
54
  NAME = "DUC Extractive"
50
- SUBJECTS: list[str] = ["raw"]
55
+ SUBJECTS: list[str] = ["default"]
51
56
 
52
57
  def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
53
58
  return item["extractive_keyphrases"]
@@ -61,14 +66,13 @@ class DUC_EXTRACTIVE(DUC):
61
66
 
62
67
  class DUC_ABSTRACTIVE(DUC):
63
68
  NAME = "DUC Abstractive"
64
- SUBJECTS: list[str] = ["raw"]
69
+ SUBJECTS: list[str] = ["default"]
65
70
 
66
71
  def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
67
72
  return item["abstractive_keyphrases"]
68
73
 
69
74
  def _load_dataset(self, subject: str) -> None:
70
- # not all samples have abstractive keyphrases
71
- hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
75
+ hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
72
76
  self.dataset = {}
73
77
 
74
78
  for split, data in hf_dataset.items():
@@ -25,6 +25,7 @@ class Flores200(BaseTask[str]):
25
25
 
26
26
  NAME = "FLoRes-200"
27
27
  DATASET_PATH = "facebook/flores"
28
+ HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
28
29
  SAMPLE_SPLIT = "devtest"
29
30
  FEWSHOT_SPLIT = "dev"
30
31
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -66,7 +67,6 @@ class Flores200(BaseTask[str]):
66
67
  split=kwargs.get("split"),
67
68
  data_files=None, # Let it auto-discover parquet files
68
69
  revision=self.HF_REVISION,
69
- trust_remote_code=False, # Disable the loading script!
70
70
  cache_dir=cache_dir,
71
71
  download_config=download_config,
72
72
  )
@@ -79,7 +79,6 @@ class Flores200(BaseTask[str]):
79
79
  dataset = load_dataset(
80
80
  **kwargs,
81
81
  revision=self.HF_REVISION,
82
- trust_remote_code=True,
83
82
  cache_dir=cache_dir,
84
83
  download_config=download_config,
85
84
  )
@@ -92,6 +92,7 @@ class GSM8KEvalHarness(BaseTask[str]):
92
92
 
93
93
  NAME = "GSM8KEvalHarness"
94
94
  DATASET_PATH = "openai/gsm8k"
95
+ HF_REVISION = "main"
95
96
  SAMPLE_SPLIT = "test"
96
97
  FEWSHOT_SPLIT = "train"
97
98
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -39,13 +39,10 @@ class InfiniteBench(BaseTask[str], ABC):
39
39
  }
40
40
  )
41
41
  try:
42
- return load_dataset(
43
- **kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
44
- )
42
+ return load_dataset(**kwargs, cache_dir=cache_dir, download_config=download_config, features=ft)
45
43
  except Exception:
46
44
  return load_dataset(
47
45
  **kwargs,
48
- trust_remote_code=True,
49
46
  cache_dir=f"{Path.home()}/.cache/eval-framework",
50
47
  features=ft,
51
48
  )
@@ -73,6 +73,7 @@ class MMLU_DE(BaseTask[str]):
73
73
 
74
74
  NAME = "MMLU_DE"
75
75
  DATASET_PATH = "LeoLM/MMLU_de"
76
+ HF_REVISION = "11433b408001dd26444c7e666cc536e0b8907ca5"
76
77
  SAMPLE_SPLIT = "test"
77
78
  FEWSHOT_SPLIT = "validation"
78
79
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -27,6 +27,7 @@ class ARC_EU20_DE(ARC):
27
27
 
28
28
  NAME = "ARC_EU20_DE"
29
29
  DATASET_PATH = "openGPT-X/arcx"
30
+ HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
30
31
  SAMPLE_SPLIT = "test"
31
32
  FEWSHOT_SPLIT = "train"
32
33
  SUBJECTS = ["challenge_DE", "easy_DE"]
@@ -36,6 +37,7 @@ class ARC_EU20_DE(ARC):
36
37
  class ARC_EU20_FR(ARC):
37
38
  NAME = "ARC_EU20_FR"
38
39
  DATASET_PATH = "openGPT-X/arcx"
40
+ HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
39
41
  SAMPLE_SPLIT = "test"
40
42
  FEWSHOT_SPLIT = "train"
41
43
  SUBJECTS = ["challenge_FR", "easy_FR"]
@@ -51,6 +53,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
51
53
  """ # noqa: E501
52
54
 
53
55
  NAME = "GSM8K_EU20_DE"
56
+ HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
54
57
  DATASET_PATH = "openGPT-X/gsm8kx"
55
58
  SAMPLE_SPLIT = "test"
56
59
  FEWSHOT_SPLIT = "train"
@@ -60,6 +63,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
60
63
 
61
64
  class GSM8K_EU20_FR(GSM8KEvalHarness):
62
65
  NAME = "GSM8K_EU20_FR"
66
+ HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
63
67
  DATASET_PATH = "openGPT-X/gsm8kx"
64
68
  SAMPLE_SPLIT = "test"
65
69
  FEWSHOT_SPLIT = "train"
@@ -77,6 +81,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
77
81
 
78
82
  NAME = "HellaSwag_EU20_DE"
79
83
  DATASET_PATH = "openGPT-X/hellaswagx"
84
+ HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
80
85
  SAMPLE_SPLIT = "train"
81
86
  FEWSHOT_SPLIT = "validation"
82
87
  SUBJECTS = ["DE"]
@@ -86,6 +91,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
86
91
  class HELLASWAG_EU20_FR(HELLASWAG):
87
92
  NAME = "HellaSwag_EU20_FR"
88
93
  DATASET_PATH = "openGPT-X/hellaswagx"
94
+ HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
89
95
  SAMPLE_SPLIT = "train"
90
96
  FEWSHOT_SPLIT = "validation"
91
97
  SUBJECTS = ["FR"]
@@ -128,6 +134,7 @@ class TRUTHFULQA_EU20_DE(TRUTHFULQA):
128
134
 
129
135
  NAME = "TruthfulQA_EU20_DE"
130
136
  DATASET_PATH = "openGPT-X/truthfulqax"
137
+ HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
131
138
  LANGUAGE = Language.DEU
132
139
 
133
140
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
@@ -182,6 +189,7 @@ TRUTHFULQA_EU20_FR_FEWSHOT_ITEMS = [
182
189
  class TRUTHFULQA_EU20_FR(TRUTHFULQA):
183
190
  NAME = "TruthfulQA_EU20_FR"
184
191
  DATASET_PATH = "openGPT-X/truthfulqax"
192
+ HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
185
193
  LANGUAGE = Language.FRA
186
194
 
187
195
  def _load_dataset(self, subject: SubjectType) -> None:
@@ -214,6 +222,7 @@ class MMLU_EU20_DE(MMLU):
214
222
 
215
223
  NAME = "MMLU_EU20_DE"
216
224
  DATASET_PATH = "openGPT-X/mmlux"
225
+ HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
217
226
  SAMPLE_SPLIT = "test"
218
227
  FEWSHOT_SPLIT = "dev" # one could merge dev and validation to have a larger pool of fewshot examples
219
228
  SUBJECTS = [i + "_DE" for i in MMLU_SUBJECTS]
@@ -321,6 +330,7 @@ MMLU_SUBJECTS_TRANSLATION_FR = {
321
330
  class MMLU_EU20_FR(MMLU):
322
331
  NAME = "MMLU_EU20_FR"
323
332
  DATASET_PATH = "openGPT-X/mmlux"
333
+ HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
324
334
  SAMPLE_SPLIT = "test"
325
335
  FEWSHOT_SPLIT = "dev"
326
336
  SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]
@@ -15,6 +15,7 @@ class PIQA(BaseTask[str]):
15
15
 
16
16
  NAME = "PIQA"
17
17
  DATASET_PATH = "ybisk/piqa"
18
+ HF_REVISION = "6b3aceb3276e5ab7e51895d73151a718690af38c"
18
19
  SAMPLE_SPLIT = "validation" # 1838 examples (same split as lm-eval)
19
20
  FEWSHOT_SPLIT = "test" # 3084 examples
20
21
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -75,7 +75,6 @@ class SQUAD2(BaseTask[str]):
75
75
  return load_dataset(
76
76
  **kwargs,
77
77
  revision=self.HF_REVISION,
78
- trust_remote_code=True,
79
78
  cache_dir=cache_dir,
80
79
  download_config=download_config,
81
80
  )
@@ -1,5 +1,11 @@
1
+ import os
2
+ from pathlib import Path
1
3
  from typing import Any
2
4
 
5
+ from datasets import DownloadConfig, load_dataset
6
+ from huggingface_hub import HfApi
7
+ from huggingface_hub.errors import RevisionNotFoundError
8
+
3
9
  from eval_framework.tasks.base import Language
4
10
  from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
5
11
 
@@ -19,6 +25,7 @@ class WINOX(WINOGRANDE):
19
25
  """
20
26
 
21
27
  DATASET_PATH = "demelin/wino_x"
28
+ HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
22
29
  SAMPLE_SPLIT = "test"
23
30
  FEWSHOT_SPLIT = "test"
24
31
  LANGUAGE_SHORT_CODE = ""
@@ -42,6 +49,31 @@ class WINOX(WINOGRANDE):
42
49
  ]
43
50
  return choices
44
51
 
52
+ def _load_hf_dataset(self, **kwargs: Any) -> Any:
53
+ """Override to handle FLORES-200 encoding issues by using parquet files."""
54
+ # Check if the HF_REVISION is valid before loading the dataset
55
+ if self.HF_REVISION:
56
+ try:
57
+ _ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
58
+ except Exception as e:
59
+ if isinstance(e, RevisionNotFoundError):
60
+ raise e
61
+
62
+ cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
63
+ download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
64
+
65
+ dataset = load_dataset(
66
+ kwargs.get("path", self.DATASET_PATH),
67
+ name=kwargs.get("name"),
68
+ split=kwargs.get("split"),
69
+ data_files=None, # Let it auto-discover parquet files
70
+ revision=self.HF_REVISION,
71
+ cache_dir=cache_dir,
72
+ download_config=download_config,
73
+ )
74
+
75
+ return dataset
76
+
45
77
 
46
78
  class WINOX_DE(WINOX):
47
79
  NAME = "WINOX_DE"
@@ -16,6 +16,7 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]):
16
16
 
17
17
  NAME = "ZeroSCROLLS QuALITY"
18
18
  DATASET_PATH = "tau/zero_scrolls"
19
+ HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
19
20
  SAMPLE_SPLIT = "validation"
20
21
  FEWSHOT_SPLIT = "validation"
21
22
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -48,6 +49,7 @@ class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
48
49
  """ZeroSCROLLS dataset: https://huggingface.co/datasets/tau/zero_scrolls"""
49
50
 
50
51
  DATASET_PATH = "tau/zero_scrolls"
52
+ HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
51
53
  SAMPLE_SPLIT = "validation"
52
54
  FEWSHOT_SPLIT = "validation"
53
55
  RESPONSE_TYPE = ResponseType.COMPLETION
File without changes