eval-framework 0.3.6__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {eval_framework-0.3.6 → eval_framework-0.3.7}/PKG-INFO +1 -1
  2. {eval_framework-0.3.6 → eval_framework-0.3.7}/pyproject.toml +1 -1
  3. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/openai.py +23 -10
  4. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/ifeval.py +2 -0
  5. {eval_framework-0.3.6 → eval_framework-0.3.7}/LICENSE +0 -0
  6. {eval_framework-0.3.6 → eval_framework-0.3.7}/README.md +0 -0
  7. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/__init__.py +0 -0
  8. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/base_config.py +0 -0
  9. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/__init__.py +0 -0
  10. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/determined.py +0 -0
  11. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/eval.py +0 -0
  12. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/local.py +0 -0
  13. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/evaluation_generator.py +0 -0
  14. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/exceptions.py +0 -0
  15. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/drop_process_results.py +0 -0
  16. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  17. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  18. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  19. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  20. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  21. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/__init__.py +0 -0
  22. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/aleph_alpha.py +0 -0
  23. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/base.py +0 -0
  24. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/huggingface.py +0 -0
  25. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/mistral.py +0 -0
  26. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/models.py +0 -0
  27. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/vllm.py +0 -0
  28. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/logger.py +0 -0
  29. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/main.py +0 -0
  30. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/__init__.py +0 -0
  31. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  32. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  33. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/base.py +0 -0
  34. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/__init__.py +0 -0
  35. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  36. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  37. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/bleu.py +0 -0
  38. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/chrf.py +0 -0
  39. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  40. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  41. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/comet.py +0 -0
  42. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  43. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  44. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  45. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  46. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  47. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/f1.py +0 -0
  48. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  49. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  50. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  51. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/json_format.py +0 -0
  52. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  53. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/length_control.py +0 -0
  54. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  55. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  56. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  57. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  58. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  59. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  60. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/repetition.py +0 -0
  61. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  62. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  63. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  64. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  65. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  66. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ter.py +0 -0
  67. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  68. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  69. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  70. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/__init__.py +0 -0
  71. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/base.py +0 -0
  72. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  73. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  74. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  75. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  76. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  77. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  78. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  79. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  80. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  81. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  82. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  83. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  84. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  85. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  86. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  87. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  88. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  89. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  90. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  91. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  92. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  93. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  94. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  95. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  96. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  97. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/utils.py +0 -0
  98. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  99. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  100. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  101. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  102. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  103. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  104. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  105. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  106. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/py.typed +0 -0
  107. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/response_generator.py +0 -0
  108. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/__init__.py +0 -0
  109. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/base.py +0 -0
  110. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  111. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/result_processor.py +0 -0
  112. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  113. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/run.py +0 -0
  114. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/run_direct.py +0 -0
  115. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/shared/types.py +0 -0
  116. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/suite.py +0 -0
  117. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  118. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/base.py +0 -0
  120. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  121. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  122. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  123. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  124. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  125. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  126. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  127. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  128. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  129. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  130. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  131. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  132. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  133. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  134. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  135. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  136. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  137. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  138. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  139. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  140. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  141. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  142. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  143. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  144. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  145. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  146. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  147. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  148. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  149. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  150. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  151. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  152. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  153. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  154. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  155. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  156. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  157. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  158. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  159. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  160. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  161. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  162. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  163. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  164. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  165. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  166. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  167. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  168. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  169. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  170. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  171. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  172. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  173. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/eval_config.py +0 -0
  174. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/perturbation.py +0 -0
  175. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/registry.py +0 -0
  176. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_loader.py +0 -0
  177. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_names.py +0 -0
  178. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_style.py +0 -0
  179. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/utils.py +0 -0
  180. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/constants.py +0 -0
  181. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/file_ops.py +0 -0
  182. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/generate_task_docs.py +0 -0
  183. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/helpers.py +0 -0
  184. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/logging.py +0 -0
  185. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/packaging.py +0 -0
  186. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/tqdm_handler.py +0 -0
  187. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/README.md +0 -0
  188. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/__init__.py +0 -0
  189. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/formatter.py +0 -0
  190. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/mistral_formatter.py +0 -0
  191. {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.6
3
+ Version: 0.3.7
4
4
  Summary: Evalulation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.6"
3
+ version = "0.3.7"
4
4
  description = "Evalulation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -8,18 +8,38 @@ from functools import partial
8
8
 
9
9
  import tiktoken
10
10
  from openai import OpenAI
11
- from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
11
+ from openai.types.chat import (
12
+ ChatCompletionAssistantMessageParam,
13
+ ChatCompletionMessageParam,
14
+ ChatCompletionSystemMessageParam,
15
+ ChatCompletionUserMessageParam,
16
+ )
12
17
  from tokenizers import Tokenizer
13
18
  from transformers import AutoTokenizer
14
19
 
15
20
  from eval_framework.llm.base import BaseLLM
16
21
  from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
17
22
  from eval_framework.tasks.base import Sample
18
- from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
23
+ from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message, Role
19
24
 
20
25
  logger = logging.getLogger(__name__)
21
26
 
22
27
 
28
+ def _to_chat_completion_message(message: Message) -> ChatCompletionMessageParam:
29
+ match message.role:
30
+ case Role.SYSTEM:
31
+ return ChatCompletionSystemMessageParam(role="system", content=message.content)
32
+ case Role.USER:
33
+ return ChatCompletionUserMessageParam(role="user", content=message.content)
34
+ case Role.ASSISTANT:
35
+ return ChatCompletionAssistantMessageParam(role="assistant", content=message.content)
36
+ case None:
37
+ raise ValueError(
38
+ "Cannot send a Message without a role through the chat completion API; "
39
+ "the legacy roleless format is only supported for fine-tuning."
40
+ )
41
+
42
+
23
43
  class OpenAIModel(BaseLLM):
24
44
  """
25
45
  LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
@@ -158,14 +178,7 @@ class OpenAIModel(BaseLLM):
158
178
 
159
179
  else:
160
180
  # Use chat completion API
161
- chat_messages = [
162
- (
163
- ChatCompletionUserMessageParam(role="user", content=m.content)
164
- if m.role is not None and m.role.value.lower() == "user"
165
- else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
166
- )
167
- for m in single_messages
168
- ]
181
+ chat_messages = [_to_chat_completion_message(m) for m in single_messages]
169
182
  assert self._model_name is not None
170
183
  chat_response = self._client.chat.completions.create(
171
184
  model=self._model_name,
@@ -1,6 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
4
+ from eval_framework.metrics.completion.language_checker import LanguageRawConsistencyChecker
4
5
  from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
5
6
 
6
7
 
@@ -76,3 +77,4 @@ class IFEvalDe(IFEval):
76
77
  DATASET_PATH = "jzhang86/de_ifeval"
77
78
  SUBJECTS = [NO_SUBJECT]
78
79
  LANGUAGE = {NO_SUBJECT: Language.DEU}
80
+ METRICS = [IFEvalMetric, LanguageRawConsistencyChecker]
File without changes
File without changes