eval-framework 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. {eval_framework-0.3.3 → eval_framework-0.3.4}/PKG-INFO +1 -1
  2. {eval_framework-0.3.3 → eval_framework-0.3.4}/pyproject.toml +1 -1
  3. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/response_generator.py +1 -1
  4. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/base.py +30 -11
  5. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
  6. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/eval_config.py +1 -1
  7. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/utils.py +19 -1
  8. {eval_framework-0.3.3 → eval_framework-0.3.4}/LICENSE +0 -0
  9. {eval_framework-0.3.3 → eval_framework-0.3.4}/README.md +0 -0
  10. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/__init__.py +0 -0
  11. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/base_config.py +0 -0
  12. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/__init__.py +0 -0
  13. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/determined.py +0 -0
  14. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/eval.py +0 -0
  15. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/local.py +0 -0
  16. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/evaluation_generator.py +0 -0
  17. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/exceptions.py +0 -0
  18. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/drop_process_results.py +0 -0
  19. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  20. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  21. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  22. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  23. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  24. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/__init__.py +0 -0
  25. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/aleph_alpha.py +0 -0
  26. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/base.py +0 -0
  27. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/huggingface.py +0 -0
  28. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/mistral.py +0 -0
  29. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/models.py +0 -0
  30. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/openai.py +0 -0
  31. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/vllm.py +0 -0
  32. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/logger.py +0 -0
  33. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/main.py +0 -0
  34. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/__init__.py +0 -0
  35. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  36. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  37. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/base.py +0 -0
  38. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
  39. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  40. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  41. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
  42. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
  43. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  44. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  45. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/comet.py +0 -0
  46. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  47. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  48. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  49. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  50. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  51. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/f1.py +0 -0
  52. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  53. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  54. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  55. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
  56. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  57. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
  58. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  59. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  60. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  61. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  62. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  63. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  64. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
  65. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  66. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  67. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  68. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  69. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  70. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ter.py +0 -0
  71. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  72. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  73. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  74. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
  75. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/base.py +0 -0
  76. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  77. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  78. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  79. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  80. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  81. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  82. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  83. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  84. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  85. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  86. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  87. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  88. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  89. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  90. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  91. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  92. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  93. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  94. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  95. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  96. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  97. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  98. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  99. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  100. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  101. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/utils.py +0 -0
  102. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  103. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  104. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  105. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  106. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  107. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  108. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  109. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  110. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/py.typed +0 -0
  111. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/__init__.py +0 -0
  112. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/base.py +0 -0
  113. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  114. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/result_processor.py +0 -0
  115. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  116. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/run.py +0 -0
  117. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/run_direct.py +0 -0
  118. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/shared/types.py +0 -0
  119. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/suite.py +0 -0
  120. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  121. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/__init__.py +0 -0
  122. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  123. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  124. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  125. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  126. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  127. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  128. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  129. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  130. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  131. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  132. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  133. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  134. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  135. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  136. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  137. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  138. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  139. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  140. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  141. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  142. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  143. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  144. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  145. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  146. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  147. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  148. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  149. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  150. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  151. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  152. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  153. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  154. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  155. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  156. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  157. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  158. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  159. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  160. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  161. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  162. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  163. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  164. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  165. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  166. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  167. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  168. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  169. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  170. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  171. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  172. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  173. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  174. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  175. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/perturbation.py +0 -0
  176. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/registry.py +0 -0
  177. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_loader.py +0 -0
  178. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_names.py +0 -0
  179. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_style.py +0 -0
  180. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.3"
3
+ version = "0.3.4"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -78,7 +78,7 @@ class ResponseGenerator:
78
78
  custom_hf_revision=self.config.hf_revision,
79
79
  )
80
80
 
81
- self.response_type, _ = self.task._get_type_and_metrics()
81
+ self.response_type = self.task.get_response_type()
82
82
 
83
83
  def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
84
84
  """
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
15
15
  from pydantic import BaseModel, ConfigDict
16
16
 
17
17
  from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
18
- from eval_framework.tasks.utils import raise_errors
18
+ from eval_framework.tasks.utils import classproperty, raise_errors
19
19
  from template_formatting.formatter import Message, Role
20
20
 
21
21
  if TYPE_CHECKING:
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
91
91
  DATASET_PATH: str
92
92
  SAMPLE_SPLIT: str
93
93
  FEWSHOT_SPLIT: str
94
- RESPONSE_TYPE: ResponseType
95
- METRICS: list[type["BaseMetric"]]
96
94
  SUBJECTS: list[SubjectType]
97
95
  HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility
98
96
 
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
104
102
  # language by subtopic, or `None` (for tasks not specific to a single language).
105
103
  LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
106
104
 
105
+ # RESPONSE_TYPE and METRICS are exposed as classproperties, so you can access them via either
106
+ # `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
107
+ # By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
108
+
107
109
  def __init__(self, num_fewshot: int = 0) -> None:
108
110
  self.num_fewshot = num_fewshot
109
111
  self.stop_sequences: list[str] | None = None
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
332
334
  return None
333
335
 
334
336
  def get_metadata(self) -> dict[str, str | list[str]]:
335
- response_type, metrics = self._get_type_and_metrics()
336
-
337
337
  meta: dict[str, str | list[str]] = {
338
338
  "dataset_path": self.DATASET_PATH,
339
339
  "sample_split": self.SAMPLE_SPLIT,
340
340
  "fewshot_split": self.FEWSHOT_SPLIT,
341
- "response_type": response_type.value,
342
- "metrics": [m.NAME for m in metrics],
341
+ "response_type": self.get_response_type().value,
342
+ "metrics": [m.NAME for m in self.get_metrics()],
343
343
  "subjects": [str(s) for s in self.SUBJECTS],
344
344
  }
345
345
  if hasattr(self, "TASK_STYLER"):
@@ -420,7 +420,26 @@ class BaseTask[SubjectType](ABC):
420
420
  )
421
421
  return completion_list
422
422
 
423
- def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]:
424
- if hasattr(self, "TASK_STYLER"):
425
- return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
426
- return self.RESPONSE_TYPE, self.METRICS
423
+ @classmethod
424
+ def get_response_type(cls) -> ResponseType:
425
+ """Return the response type of the task (or the styler if it exists)."""
426
+ if hasattr(cls, "TASK_STYLER"):
427
+ return cls.TASK_STYLER.response_type
428
+ return cls.RESPONSE_TYPE
429
+
430
+ @classmethod
431
+ def get_metrics(cls) -> list[type["BaseMetric"]]:
432
+ """Return the metrics of the task (or the styler if it exists)."""
433
+ if hasattr(cls, "TASK_STYLER"):
434
+ return cls.TASK_STYLER.metrics
435
+ return cls.METRICS
436
+
437
+ @classproperty
438
+ def RESPONSE_TYPE(cls) -> ResponseType:
439
+ """For backwards compatibility."""
440
+ return cls.get_response_type()
441
+
442
+ @classproperty
443
+ def METRICS(cls) -> list[type["BaseMetric"]]:
444
+ """For backwards compatibility."""
445
+ return cls.get_metrics()
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
86
86
 
87
87
  class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
88
88
  NAME = "NaturalQsOpenMC"
89
- TASK_STYLER = MCStyle(space_prefixed_labels=True)
89
+ TASK_STYLER = MCStyle()
90
90
 
91
91
 
92
92
  class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
@@ -112,7 +112,7 @@ class EvalConfig(BaseConfig):
112
112
  @model_validator(mode="after")
113
113
  def validate_llm_judge_defined(self) -> "EvalConfig":
114
114
  task = get_task(self.task_name)
115
- _, task_metrics = task(num_fewshot=0)._get_type_and_metrics()
115
+ task_metrics = task(num_fewshot=0).get_metrics()
116
116
  for metric_class in task_metrics:
117
117
  if issubclass(metric_class, BaseLLMJudgeMetric):
118
118
  assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
@@ -8,7 +8,7 @@ import string
8
8
  import threading
9
9
  from collections.abc import Callable
10
10
  from pathlib import Path
11
- from typing import Any, Literal, NamedTuple
11
+ from typing import Any, Literal, NamedTuple, overload
12
12
 
13
13
  import dill
14
14
  import numpy as np
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
22
22
  RANDOM_SEED = 42 # hacky way to get around circular import
23
23
  redis_warning_printed = False
24
24
 
25
+
26
+ class classproperty[T]:
27
+ """Descriptor supporting property-like access on classes and instances."""
28
+
29
+ def __init__(self, fget: Callable[[Any], T]) -> None:
30
+ self.fget = fget
31
+
32
+ @overload
33
+ def __get__(self, obj: None, owner: type[Any]) -> T: ...
34
+
35
+ @overload
36
+ def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
37
+
38
+ def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
39
+ cls = owner if owner is not None else type(obj)
40
+ return self.fget(cls)
41
+
42
+
25
43
  _pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
26
44
  _pools_lock = threading.Lock()
27
45
 
File without changes
File without changes