eval-framework 0.2.9__tar.gz → 0.2.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.9 → eval_framework-0.2.11}/PKG-INFO +2 -2
  2. {eval_framework-0.2.9 → eval_framework-0.2.11}/pyproject.toml +2 -2
  3. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/aleph_alpha.py +11 -70
  4. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/base.py +8 -6
  5. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/huggingface.py +12 -13
  6. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/vllm.py +24 -23
  7. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc.py +1 -1
  8. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/casehold.py +3 -1
  9. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -1
  10. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/math_reasoning.py +1 -1
  11. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/truthfulqa.py +1 -1
  12. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogrande.py +2 -2
  13. {eval_framework-0.2.9 → eval_framework-0.2.11}/LICENSE +0 -0
  14. {eval_framework-0.2.9 → eval_framework-0.2.11}/README.md +0 -0
  15. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/__init__.py +0 -0
  16. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/base_config.py +0 -0
  17. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/__init__.py +0 -0
  18. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/determined.py +0 -0
  19. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/eval.py +0 -0
  20. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/local.py +0 -0
  21. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/evaluation_generator.py +0 -0
  22. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/exceptions.py +0 -0
  23. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  24. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  25. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  26. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  27. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  28. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/__init__.py +0 -0
  29. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/mistral.py +0 -0
  30. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/models.py +0 -0
  31. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/openai.py +0 -0
  32. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/logger.py +0 -0
  33. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/main.py +0 -0
  34. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/__init__.py +0 -0
  35. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/base.py +0 -0
  36. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/__init__.py +0 -0
  37. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  38. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  39. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/bleu.py +0 -0
  40. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/chrf.py +0 -0
  41. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  42. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  43. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/comet.py +0 -0
  44. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  45. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  46. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  47. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  48. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/f1.py +0 -0
  49. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  50. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  51. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  52. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/json_format.py +0 -0
  53. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  54. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/length_control.py +0 -0
  55. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  56. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  57. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  58. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/repetition.py +0 -0
  59. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  60. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  61. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  62. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  63. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  64. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/ter.py +0 -0
  65. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  66. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  67. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  68. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/__init__.py +0 -0
  69. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/base.py +0 -0
  70. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  71. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  72. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  73. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  74. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  75. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  76. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  77. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  78. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  79. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  80. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  81. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  82. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  83. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  84. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  85. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  86. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  87. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  88. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  89. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  90. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  91. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  92. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  93. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  94. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  95. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/utils.py +0 -0
  96. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  97. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  98. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  99. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  100. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  101. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  102. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  103. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/py.typed +0 -0
  104. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/response_generator.py +0 -0
  105. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/__init__.py +0 -0
  106. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/base.py +0 -0
  107. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  108. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/result_processor.py +0 -0
  109. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  110. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/run.py +0 -0
  111. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/run_direct.py +0 -0
  112. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/shared/types.py +0 -0
  113. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/__init__.py +0 -0
  114. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/base.py +0 -0
  115. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  116. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  117. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  118. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  119. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  120. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  121. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  122. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  123. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  124. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  125. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  126. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  127. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  128. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  129. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  130. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  131. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  132. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  133. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  134. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  135. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  136. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  137. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  138. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  139. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  140. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  141. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  142. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  143. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  144. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  145. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  146. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  147. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  148. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  149. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  150. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  151. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  152. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  153. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/eval_config.py +0 -0
  154. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/perturbation.py +0 -0
  155. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/registry.py +0 -0
  156. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/task_loader.py +0 -0
  157. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/task_names.py +0 -0
  158. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/utils.py +0 -0
  159. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/constants.py +0 -0
  160. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/file_ops.py +0 -0
  161. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/generate_task_docs.py +0 -0
  162. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/helpers.py +0 -0
  163. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/logging.py +0 -0
  164. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/packaging.py +0 -0
  165. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/tqdm_handler.py +0 -0
  166. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -238,7 +238,7 @@ Requires-Dist: numpy>=1.26.4
238
238
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
239
  Requires-Dist: accelerate ; extra == 'accelerate'
240
240
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
241
- Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
241
+ Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
242
242
  Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
243
243
  Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
244
244
  Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.2.9"
3
+ version = "0.2.11"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -53,7 +53,7 @@ determined = [
53
53
  "determined>=0.38,<0.39",
54
54
  "tensorboard==2.19.0"
55
55
  ]
56
- api = ["aleph-alpha-client>=10,<11"]
56
+ api = ["aleph-alpha-client>=11.5.1"]
57
57
  openai = [
58
58
  "openai>=1.62,<2.8",
59
59
  "tiktoken>=0.9,<0.10",
@@ -3,16 +3,12 @@ import json
3
3
  import logging
4
4
  import math
5
5
  import os
6
- import random
7
6
  import re
8
- import time
9
7
  import traceback
10
8
  from collections.abc import Callable, Sequence
11
9
 
12
- import aiohttp
13
10
  from aleph_alpha_client import (
14
11
  AsyncClient,
15
- BusyError,
16
12
  Client,
17
13
  CompletionRequest,
18
14
  CompletionResponse,
@@ -53,7 +49,6 @@ class AlephAlphaAPIModel(BaseLLM):
53
49
  max_retries: int = 100,
54
50
  max_async_concurrent_requests: int = 32,
55
51
  request_timeout_seconds: int = 30 * 60 + 5,
56
- queue_full_timeout_seconds: int = 30 * 60 + 5,
57
52
  bytes_per_token: float | None = None,
58
53
  token: str = os.getenv("AA_TOKEN", "dummy"),
59
54
  base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
@@ -70,7 +65,6 @@ class AlephAlphaAPIModel(BaseLLM):
70
65
  self.max_async_concurrent_requests = max_async_concurrent_requests
71
66
  self.max_retries = max_retries
72
67
  self.request_timeout_seconds = request_timeout_seconds
73
- self.queue_full_timeout_seconds = queue_full_timeout_seconds
74
68
  self.token = token
75
69
  self.base_url = base_url
76
70
  self._validate_model_availability(base_url, token)
@@ -101,56 +95,6 @@ class AlephAlphaAPIModel(BaseLLM):
101
95
  except Exception as e:
102
96
  raise RuntimeError(f"Model '{self._llm_name}' is not available: {e}")
103
97
 
104
- async def _request_with_backoff(
105
- self, client: AsyncClient, request: CompletionRequest, id: int
106
- ) -> CompletionResponse:
107
- """
108
- Query Aleph-Alpha API with complete. Retry with back-off until it responds.
109
- """
110
- num_attempts = 0
111
- start_time: float | None = None
112
-
113
- while True:
114
- try:
115
- return await client.complete(request, model=self._llm_name)
116
-
117
- except (TimeoutError, BusyError, RuntimeError, aiohttp.ClientError) as e:
118
- status_code: str = safe_json_loads(e.args[1]).get("code", "") if len(e.args) >= 2 else ""
119
- str_e = str(e)
120
- if status_code == "QUEUE_FULL":
121
- # Worker not available or missed a heartbeat (inference longer than scheduler's
122
- # API_MODEL_AVAILABLE_TIMEOUT_DURATION_MILLIS) or the scheduler is overloaded.
123
- if start_time is None:
124
- start_time = time.time()
125
- elapsed = time.time() - start_time
126
- if elapsed <= self.queue_full_timeout_seconds:
127
- logger.info(
128
- f"Request {id}: {status_code or str_e[:256]} - retrying: attempt"
129
- f" {num_attempts}/{self.max_retries}, elapsed {elapsed:.1f} sec"
130
- )
131
- # don't count as retry (request returns immediately, so just wait a bit not to DoS the server)
132
- await asyncio.sleep(random.randint(5, 30))
133
- continue
134
-
135
- elif (
136
- status_code == "TIMEOUT_TASK"
137
- or isinstance(e, TimeoutError)
138
- or "502 Bad Gateway" in str_e
139
- or "504 Gateway Time-out" in str_e
140
- or isinstance(e, aiohttp.ClientError)
141
- ):
142
- # client timeout, either because task too long in a queue or inference too long
143
- # (scheduler's API_CLIENT_TIMEOUT_DURATION_MILLIS). Retrying for the "inference too long"
144
- # case makes no sense but we unfortunately don't know which case has happened.
145
- num_attempts += 1
146
- start_time = None
147
- if num_attempts < self.max_retries:
148
- logger.info(f"Request {id}: TIMEOUT_TASK - retrying: attempt {num_attempts}/{self.max_retries}")
149
- await asyncio.sleep(random.randint(5, 30))
150
- continue
151
-
152
- raise e
153
-
154
98
  def _error_from_exception(self, e: Exception) -> Error:
155
99
  """Convert an exception to an Error object."""
156
100
  if len(e.args) >= 2:
@@ -171,39 +115,36 @@ class AlephAlphaAPIModel(BaseLLM):
171
115
  async def _process_request_with_client(
172
116
  self,
173
117
  client: AsyncClient,
174
- semaphore: asyncio.Semaphore,
175
118
  request: CompletionRequest,
176
119
  id: int,
177
120
  ) -> tuple[CompletionRequest, CompletionResponse | Error]:
178
121
  """Process a single request, returning the request and either a response or error."""
179
- async with semaphore:
180
- try:
181
- response = await self._request_with_backoff(client=client, request=request, id=id)
182
- logger.info(f"Request {id}: Success")
183
- return (request, response)
184
- except Exception as e:
185
- if raise_errors():
186
- raise e
187
- logger.info(f"Request {id}: Failure: {str(e)[:256]}")
188
- return (request, self._error_from_exception(e))
122
+ try:
123
+ response = await client.complete(request, model=self._llm_name)
124
+ logger.info(f"Request {id}: Success")
125
+ return (request, response)
126
+ except Exception as e:
127
+ if raise_errors():
128
+ raise e
129
+ logger.info(f"Request {id}: Failure: {str(e)[:256]}")
130
+ return (request, self._error_from_exception(e))
189
131
 
190
132
  async def _process_requests(
191
133
  self,
192
134
  requests: list[CompletionRequest],
193
135
  ) -> list[tuple[CompletionRequest, CompletionResponse | Error]]:
194
136
  """Process multiple requests concurrently, returning request/response pairs."""
195
- semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
196
137
  async with AsyncClient(
197
138
  host=self.base_url,
198
139
  nice=True,
199
140
  request_timeout_seconds=self.request_timeout_seconds,
200
141
  token=self.token,
201
- total_retries=0, # we have a custom retry policy in _request_with_backoff()
142
+ total_retries=self.max_retries,
143
+ limit=self.max_async_concurrent_requests,
202
144
  ) as client:
203
145
  tasks = (
204
146
  self._process_request_with_client(
205
147
  client,
206
- semaphore,
207
148
  request,
208
149
  i,
209
150
  )
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
- from collections.abc import Sequence
2
+ from collections.abc import Generator, Sequence
3
+ from contextlib import contextmanager
3
4
  from pathlib import Path
4
5
  from typing import Any
5
6
 
@@ -112,21 +113,22 @@ class BaseLLM(ABC):
112
113
  """
113
114
  pass
114
115
 
116
+ @contextmanager
115
117
  def _get_final_checkpoint(
116
118
  self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
117
- ) -> tuple[str | Path | None, str | None]:
119
+ ) -> Generator[tuple[str | Path | None, str | None], None, None]:
118
120
  if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
119
121
  if not getattr(self, "LLM_NAME", ""):
120
122
  raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
121
- return None, None # no argument given, so will use the LLM_NAME of the class
123
+ yield None, None # no argument given, so will use the LLM_NAME of the class
122
124
  elif num_provided > 1:
123
125
  raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
124
126
 
125
127
  elif checkpoint_path is not None:
126
- return checkpoint_path, str(checkpoint_path)
128
+ yield checkpoint_path, str(checkpoint_path)
127
129
 
128
130
  elif model_name is not None:
129
- return model_name, model_name
131
+ yield model_name, model_name
130
132
 
131
133
  else:
132
134
  from eval_framework.utils.file_ops import WandbFs
@@ -139,7 +141,7 @@ class BaseLLM(ABC):
139
141
  file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
140
142
  if file_root is None:
141
143
  raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
142
- return file_root, artifact_name
144
+ yield file_root, artifact_name
143
145
 
144
146
  def _get_final_formatter(
145
147
  self,
@@ -322,22 +322,21 @@ class HFLLM(BaseHFLLM):
322
322
  bytes_per_token: float | None = None,
323
323
  **kwargs: Any,
324
324
  ) -> None:
325
- final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
325
+ with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
326
+ self.checkpoint_name = checkpoint_name
327
+ if self.checkpoint_name is None and possible_name is not None:
328
+ self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
326
329
 
327
- self.checkpoint_name = checkpoint_name
328
- if self.checkpoint_name is None and possible_name is not None:
329
- self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
330
+ if final_path:
331
+ self.LLM_NAME = str(final_path)
330
332
 
331
- if final_path:
332
- self.LLM_NAME = str(final_path)
333
+ final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
333
334
 
334
- final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
335
-
336
- super().__init__(
337
- formatter=final_formatter,
338
- bytes_per_token=bytes_per_token,
339
- **kwargs,
340
- )
335
+ super().__init__(
336
+ formatter=final_formatter,
337
+ bytes_per_token=bytes_per_token,
338
+ **kwargs,
339
+ )
341
340
 
342
341
  @property
343
342
  def name(self) -> str:
@@ -137,10 +137,12 @@ class BaseVLLMModel(BaseLLM):
137
137
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
138
138
 
139
139
  self.batch_size = batch_size
140
- self._tokenizer: None | VLLMTokenizerAPI = None
141
140
 
142
141
  self.model = LLM(**model_args, device=device)
143
142
 
143
+ self._tokenizer: None | VLLMTokenizerAPI = None
144
+ _ = self.tokenizer # make sure tokenizer is initialized
145
+
144
146
  self.sampling_params: SamplingParams = self._process_sampling_params(sampling_params)
145
147
 
146
148
  logger.info(
@@ -481,28 +483,27 @@ class VLLMModel(BaseVLLMModel):
481
483
  sampling_params: SamplingParams | dict[str, Any] | None = None,
482
484
  **kwargs: Any,
483
485
  ) -> None:
484
- final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
485
-
486
- if final_path:
487
- self.LLM_NAME = str(final_path)
488
-
489
- final_name = checkpoint_name
490
- if final_name is None and possible_name is not None:
491
- final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
492
-
493
- final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
494
-
495
- super().__init__(
496
- formatter=final_formatter,
497
- checkpoint_path=final_path,
498
- checkpoint_name=final_name,
499
- max_model_len=max_model_len,
500
- tensor_parallel_size=tensor_parallel_size,
501
- gpu_memory_utilization=gpu_memory_utilization,
502
- batch_size=batch_size,
503
- sampling_params=sampling_params,
504
- **kwargs,
505
- )
486
+ with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
487
+ if final_path:
488
+ self.LLM_NAME = str(final_path)
489
+
490
+ final_name = checkpoint_name
491
+ if final_name is None and possible_name is not None:
492
+ final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
493
+
494
+ final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
495
+
496
+ super().__init__(
497
+ formatter=final_formatter,
498
+ checkpoint_path=final_path,
499
+ checkpoint_name=final_name,
500
+ max_model_len=max_model_len,
501
+ tensor_parallel_size=tensor_parallel_size,
502
+ gpu_memory_utilization=gpu_memory_utilization,
503
+ batch_size=batch_size,
504
+ sampling_params=sampling_params,
505
+ **kwargs,
506
+ )
506
507
 
507
508
 
508
509
  class VLLMRegistryModel(VLLMModel): # deprecated
@@ -15,7 +15,7 @@ class ARC(BaseTask[str]):
15
15
  """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""
16
16
 
17
17
  NAME = "ARC"
18
- DATASET_PATH = "ai2_arc"
18
+ DATASET_PATH = "allenai/ai2_arc"
19
19
  SAMPLE_SPLIT = "test"
20
20
  FEWSHOT_SPLIT = "train"
21
21
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -9,8 +9,10 @@ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Languag
9
9
 
10
10
 
11
11
  class CASEHOLD(BaseTask[str]):
12
+ """CASEHOLD dataset: https://huggingface.co/datasets/coastalcph/lex_glue"""
13
+
12
14
  NAME = "CaseHold"
13
- DATASET_PATH = "lex_glue"
15
+ DATASET_PATH = "coastalcph/lex_glue"
14
16
  SAMPLE_SPLIT = "test"
15
17
  FEWSHOT_SPLIT = "train"
16
18
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -91,7 +91,7 @@ class GSM8KEvalHarness(BaseTask[str]):
91
91
  """
92
92
 
93
93
  NAME = "GSM8KEvalHarness"
94
- DATASET_PATH = "gsm8k"
94
+ DATASET_PATH = "openai/gsm8k"
95
95
  SAMPLE_SPLIT = "test"
96
96
  FEWSHOT_SPLIT = "train"
97
97
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -539,7 +539,7 @@ class GSM8KReasoning(MATHReasoning):
539
539
  """
540
540
 
541
541
  NAME = "GSM8KReasoning"
542
- DATASET_PATH = "gsm8k"
542
+ DATASET_PATH = "openai/gsm8k"
543
543
  SAMPLE_SPLIT = "test"
544
544
  FEWSHOT_SPLIT = "train"
545
545
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -38,7 +38,7 @@ class TRUTHFULQA(BaseTask[str]):
38
38
  """TRUTHFULQA dataset: https://huggingface.co/datasets/truthfulqa/truthful_qa"""
39
39
 
40
40
  NAME = "TruthfulQA"
41
- DATASET_PATH = "truthful_qa"
41
+ DATASET_PATH = "truthfulqa/truthful_qa"
42
42
  SAMPLE_SPLIT = "validation"
43
43
  FEWSHOT_SPLIT = ""
44
44
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -13,10 +13,10 @@ ANSWER_STR_TO_NUM = {"1": 0, "2": 1}
13
13
 
14
14
 
15
15
  class WINOGRANDE(BaseTask[str]):
16
- """WINOGRANDE dataset: https://huggingface.co/datasets/winogrande"""
16
+ """WINOGRANDE dataset: https://huggingface.co/datasets/allenai/winogrande"""
17
17
 
18
18
  NAME = "Winogrande"
19
- DATASET_PATH = "winogrande"
19
+ DATASET_PATH = "allenai/winogrande"
20
20
  SAMPLE_SPLIT = "validation"
21
21
  FEWSHOT_SPLIT = "train"
22
22
  RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
File without changes