eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,161 @@
1
+ eval_framework/__init__.py,sha256=dLv--h62kDYK2uN5aFpEowXpW2P9XLwMud-NwoiW_u4,120
2
+ eval_framework/base_config.py,sha256=LJOHr0MtE9PPsfbLmP2tpoa52Tt0rIHMaW3CTYVwehs,1236
3
+ eval_framework/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ eval_framework/context/determined.py,sha256=A3o-N7149LF6OqG9fDvmmge6tXi65qGexLuhazi1R30,7070
5
+ eval_framework/context/eval.py,sha256=uFlWBMEZ-xlJFjnvjaxvCphXlgWYXb2F-6xzVuoqXJk,4067
6
+ eval_framework/context/local.py,sha256=rO8yb1lMHdqiKcdFRVktWR6Ux3T9tM6Xa946Fpwxi5k,1962
7
+ eval_framework/evaluation_generator.py,sha256=xlZeGk0Y6jNYLQ3-8qAeCpx-tPBu924mvWLtXPqbGCA,11611
8
+ eval_framework/exceptions.py,sha256=j4jjN2Y-8vMxf0Dfms1buAJHNMzEQ6kZca6l_z-lDBo,38
9
+ eval_framework/external/ifeval_impl/README.md,sha256=fC2t3BSbjW_Hl8iAUoTwiFpblgY1NeqeF67tl5ScWT4,408
10
+ eval_framework/external/ifeval_impl/instructions.py,sha256=fp94wBZv0SQgm7OTTrguh1yiscPoYst8MqoBmoO_A6k,55615
11
+ eval_framework/external/ifeval_impl/instructions_registry.py,sha256=TzNBdO5rHl3jPwvm-o83IpJ8l1o0DoG2jp7gDSd54RU,6722
12
+ eval_framework/external/ifeval_impl/instructions_util.py,sha256=qUb8wipLfBMvHv3UpMTn-yZay_2JU7X_524f141xHJs,26095
13
+ eval_framework/external/ifeval_impl/utils.py,sha256=i2ADNqLmcBlBAdL7BZMa4HoTXJ3DU01UL01-7grcebg,4537
14
+ eval_framework/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ eval_framework/llm/aleph_alpha.py,sha256=2SforEMRAORdeq9BqCho2KURLv8FYjabGOH4fghEW9M,14542
16
+ eval_framework/llm/base.py,sha256=WIA4N4z1RH9my-qiozRLyS6PmMKyd9_Uz98aGnoGe_U,2488
17
+ eval_framework/llm/huggingface.py,sha256=YpqFFAYrBO-G6Aws2AxzNNGsHtEo8CYDWZ0Gddfn95E,15042
18
+ eval_framework/llm/mistral.py,sha256=Fkxop0tSegNo22DCdyAWXUV8mKK30Fbq_aM9iXs-HHA,2732
19
+ eval_framework/llm/models.py,sha256=W0W4vaosvkU2CrHAT-4TxWQwGPuPUaDqIPtzb4G2uDA,638
20
+ eval_framework/llm/openai.py,sha256=x_OlwGUGPh5A6wp7HZBA-mn-4u6BbfIhLPguXQUjqIE,8633
21
+ eval_framework/llm/vllm.py,sha256=T5gaDYglrN6Omv3MpZZ7n5emjxfZPycKHHtjlqvfx5A,17086
22
+ eval_framework/logger.py,sha256=8Bj7S8JRYh-SJZ3dEgueDIoVrhOjRyDsnRuLG61ft9E,61
23
+ eval_framework/main.py,sha256=y-5bpkvjwVhcYWmzHXOPDv9bPABkgD_9GXID8pJI2F0,7552
24
+ eval_framework/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ eval_framework/metrics/base.py,sha256=3VmIfC8AN-SXFf_7d_5fnwTQirDKBtJI5JxwDGOhtDU,1013
26
+ eval_framework/metrics/completion/__init__.py,sha256=3m1ekU7MH8JqV-6VHRBRQiPatqpZsNW6tQKpaXjpztE,52
27
+ eval_framework/metrics/completion/accuracy_completion.py,sha256=t-6lJBZ6dhhppepIkLEmB3TSd9qkGE3mrPYaDhnql98,697
28
+ eval_framework/metrics/completion/bleu.py,sha256=IDO3Hn-VgH7eT83iO9FCBI8gBUcj8cMOc1kfm_E73uI,3311
29
+ eval_framework/metrics/completion/chrf.py,sha256=o0zbwOpbL99fg00neET1Pb7jsfT8Sd1n-px_Jql43X8,2526
30
+ eval_framework/metrics/completion/code_assertion.py,sha256=f9XYPJzP6XWD2wqZ1_qWYyw56IhBnxp7hBmymw30ExA,1489
31
+ eval_framework/metrics/completion/code_execution_pass_at_one.py,sha256=1GyruuwS12UXfLSo3K7rRy2CZhC3W0oOAilFzAUtOeU,5123
32
+ eval_framework/metrics/completion/comet.py,sha256=M_4ITNfthjxqX8CgVKlxK5W7Gdu08FbXsmbGOx4SfSA,2333
33
+ eval_framework/metrics/completion/concordance_index.py,sha256=LfmM4KmXKiPbztoJaBRCDMA6lQdPFhHcRTYjNP0olQk,1369
34
+ eval_framework/metrics/completion/csv_format.py,sha256=sxo8xnEkGUw7FnkkZC2k58yn3GPuJQ_rJAFNLLo2sNE,3640
35
+ eval_framework/metrics/completion/cwe_accuracy.py,sha256=1LV35uxoDlKzLE_XWBItMMVsGBLqXP2DfqiI0L2T-dI,2130
36
+ eval_framework/metrics/completion/exponential_similarity.py,sha256=93rQV_pG7RbFMt0DWCDQe8iUiF9GzcTneHRxvH9tIgI,2702
37
+ eval_framework/metrics/completion/f1.py,sha256=ddHQXsQv5keZDrJvoY_nPPZtqZMEfrRrafeSWg6HQys,1512
38
+ eval_framework/metrics/completion/format_checker.py,sha256=JUgx3EbxsZEJr0bNlmQFQdQzkghvegq8QtC4vxQjvaI,1997
39
+ eval_framework/metrics/completion/grid_difference.py,sha256=sun639fzMNkhjoesfgRIsy7dofF5vxzbKlvVvUfA_y4,3104
40
+ eval_framework/metrics/completion/ifeval.py,sha256=93KxO8qfE6-9snppzpr3a7jCmCT2ciJOqWcK31VB2No,2578
41
+ eval_framework/metrics/completion/json_format.py,sha256=MJz8tFASxEsqPwCzhz66Z3m2y4eHPYchRMkk0n4kD6I,6209
42
+ eval_framework/metrics/completion/language_checker.py,sha256=QO9yhHe99ZkvZxLSZ5m5B8N_oRVNsZeklg0b5MfUadg,3323
43
+ eval_framework/metrics/completion/length_control.py,sha256=15_S5m7SNFNR5KXNhmvTy3pGhtsuawlRU76w-ehLix8,3294
44
+ eval_framework/metrics/completion/math_reasoning_completion.py,sha256=L5GH_aQI6Azngv9a2DMueraNFBPmT3Ges09CO4naTXM,12050
45
+ eval_framework/metrics/completion/niah_accuracy.py,sha256=ycFUVXpJqdA_-aBvmzKUfaSpPi_-nCDY4F27kQjsPks,5803
46
+ eval_framework/metrics/completion/placeholder_checker.py,sha256=PhpPlcrP_QDYCOJuWK12ZfcUAOYys9IxZOKICTNUa1U,1147
47
+ eval_framework/metrics/completion/repetition.py,sha256=MRsap8ZDISDfC5luqWlQA05W_anjFU6XzzvD55LsM_M,3340
48
+ eval_framework/metrics/completion/rouge_1.py,sha256=Y1m7e9q258cIFjIfGShssneFn08_85ZQF6-YqIgOORQ,1514
49
+ eval_framework/metrics/completion/rouge_2.py,sha256=3GKFHVXHKvPOjk4SaU6D1vbykK5WeE6Q2Ogjhasa1uk,1978
50
+ eval_framework/metrics/completion/rouge_geometric_mean.py,sha256=0fqiWx72eJscuLkekh901CwhFInN9HoxQ2LJod40fJs,1730
51
+ eval_framework/metrics/completion/rouge_l.py,sha256=SwM1s7MQWKjVPlS0KyHcEH9pzkA-hlidz-4gM9kiTu4,2360
52
+ eval_framework/metrics/completion/struct_eval_metrics.py,sha256=8wBx7yTfzjww1wPST57X9sjrVNHavtKXZcOiCkbNrZk,8148
53
+ eval_framework/metrics/completion/ter.py,sha256=mskQejjl1RX0WuSQk1e42-L1QfH0kwTVIhDwqbaBNEc,2614
54
+ eval_framework/metrics/completion/text_counter.py,sha256=UXBOt7okRZHx6BuVcyAS9IeNoYSnryLKkdgYn0FArF8,7100
55
+ eval_framework/metrics/efficiency/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
+ eval_framework/metrics/efficiency/bytes_per_sequence_position.py,sha256=fPNqu_fQSqy__1Es5Zbm0niBr8N6j-jnprY-ysAFrds,1849
57
+ eval_framework/metrics/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
+ eval_framework/metrics/llm/base.py,sha256=uCOGxMM7d3oS2ECZTg-Xy3GOQow68TOsB8YBKXGRnTI,286
59
+ eval_framework/metrics/llm/graders/chatbot_style_grader.py,sha256=7tplUGC7G_F730t9Ij242dBRbQKUaCzURP1iX6ZKgrI,4114
60
+ eval_framework/metrics/llm/graders/comparison_grader.py,sha256=9fIWPqVeky5MrwvWHrQNzFeeDlu6LH8bw5-eBHdc82g,5363
61
+ eval_framework/metrics/llm/graders/conciseness_grader.py,sha256=-WE7dOo7Jo57UzmesAr61WKurB9NegNBVtPLmViLOZw,3562
62
+ eval_framework/metrics/llm/graders/contains_names_grader.py,sha256=5NUGVcAzkyGJ1or5uReCbUJT3psplnHTd7dUkf_iR0Y,2724
63
+ eval_framework/metrics/llm/graders/format_correctness_grader.py,sha256=1ewPCXj97favA3BovNSOpHRILhtsTbmp5vWJfzk-968,4549
64
+ eval_framework/metrics/llm/graders/instruction_grader.py,sha256=v9ew30JHpO8LK99D2FYhFz6E-ikE4PIld3sCT79u0gk,11625
65
+ eval_framework/metrics/llm/graders/language.py,sha256=9YlEE3BjvzfHfQtRMTWrP_NxGbjKbZRbAjqo3GvL_wE,1720
66
+ eval_framework/metrics/llm/graders/long_context_grader.py,sha256=BX29D8BsVoVGOfGlQjAfFMJFw2Nn77puwMOBnHJvJoE,2476
67
+ eval_framework/metrics/llm/graders/models.py,sha256=PVGzyjOcmm-DN-NpoO8SzFyUNVoDLG330f3uFXG0SfE,2206
68
+ eval_framework/metrics/llm/graders/refusal_grader.py,sha256=SUFUiveL36LWyKR5w8LUgYl2Kx4aAc5IPu5uV8j4N5k,2272
69
+ eval_framework/metrics/llm/graders/sql_quality_grader.py,sha256=ooNCxBNKeyqFxf2nAKdtUcd7aIMQpmxcEn9iTo5XhiQ,5624
70
+ eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py,sha256=lZJzXyMNYLhY4RmrPZsBxJByPXWMk8aeDjGxUArwv4U,4599
71
+ eval_framework/metrics/llm/llm_judge_chatbot_style.py,sha256=pb_GWN5xVHRuk64XPNkIqRV1htKaNmr-Cdjx9jxgGiw,1400
72
+ eval_framework/metrics/llm/llm_judge_completion_accuracy.py,sha256=KS1Fn3cZAyIfXd8LM_o2s9IjoHpftqtuSIJ3fGDAr6Y,1523
73
+ eval_framework/metrics/llm/llm_judge_conciseness.py,sha256=v2iSxBeUU3QTjdy0hx-9t5j0pf4LMnp5z2JCiqpN9_8,1439
74
+ eval_framework/metrics/llm/llm_judge_contains_names.py,sha256=7r-sAI6Qwej4fgQIhmotXtEK5ZaLcHxgyjbP7TYzRtE,1401
75
+ eval_framework/metrics/llm/llm_judge_format_correctness.py,sha256=AwHLblRtWSo7hg0sJpcdQAZP7ldrfZFDp2rGB9-6rns,1668
76
+ eval_framework/metrics/llm/llm_judge_instruction.py,sha256=PcXACNijZSYIfLoks-bqCgjqo0YPqQpX4O5GinC2SvE,2170
77
+ eval_framework/metrics/llm/llm_judge_mtbench_pair.py,sha256=YCZcXA-HxQww7HUCgzNapJIUfPW3I0YP6WGG-dtRD9w,24787
78
+ eval_framework/metrics/llm/llm_judge_mtbench_single.py,sha256=20iNNtGm1Ch8Upt0Vk7MGlKvwYyN4i0lvRDym3AnK0w,17362
79
+ eval_framework/metrics/llm/llm_judge_refusal.py,sha256=iAoOstgOvKtk9M9wqVqrf21mM0Xbss4EraO7R3g9FBQ,1418
80
+ eval_framework/metrics/llm/llm_judge_sql.py,sha256=qMj2pHzijq2lVHqToewQL_xJSgKLulZWSb64996ztnQ,14480
81
+ eval_framework/metrics/llm/llm_judge_world_knowledge.py,sha256=C48aHS6bcVtGMk0YxzqDAGiHekypyeo--SK7EFVN5Jc,1517
82
+ eval_framework/metrics/loglikelihood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
+ eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py,sha256=l0OxJFSQiLnwJdfX72SH9k-krZr8AI1FOUYlHfiT2Q0,1921
84
+ eval_framework/metrics/loglikelihood/probability_mass.py,sha256=I3AhKlwSYQEnKFfagopqmc6-Mdnui43GR0LSFfsrJVk,2291
85
+ eval_framework/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
86
+ eval_framework/response_generator.py,sha256=-6qQ2U_he0uNKG5kcir5tiZcQ4cOybTiQ1KWTmLe3cI,19244
87
+ eval_framework/result_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
88
+ eval_framework/result_processors/base.py,sha256=wuoOQENw6GPDsRxwk5lxeNUprplNa33D1XuQ4nJdJI0,2017
89
+ eval_framework/result_processors/hf_processor.py,sha256=p-b9EBKx0ahYK8tu-h8l76rDlYtcZzx_zKIe2p8HDBI,3071
90
+ eval_framework/result_processors/result_processor.py,sha256=zMQ_SJHbr81og4I_6Q1OrQGSJWCxlES_3xklMmU1S0Q,5362
91
+ eval_framework/run.py,sha256=lWn_u5Sfp4iZVtubArg2VfpD-qIRhuIBwp-lz-2q8o8,9896
92
+ eval_framework/run_direct.py,sha256=KMWkLDuDt-HPlmjsSGKAiXd7LlrpVUPKv89Gk3i0snA,1176
93
+ eval_framework/shared/types.py,sha256=lPA5uhdRgs3H---SFsjUOYwUkqBYL0K2Y2JvxCOyMLc,8841
94
+ eval_framework/tasks/__init__.py,sha256=Fzs8DY53Dt0Gsu34Ro6Dk6by9qgaFF0UIIHERl6PO5g,120
95
+ eval_framework/tasks/base.py,sha256=ujLYsjgRtOpEccx8RligP2HJd8G-A-ct1Tr8qzxJTMM,12558
96
+ eval_framework/tasks/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
+ eval_framework/tasks/benchmarks/arc.py,sha256=KAu4etBPiLde4sLR706H536msLe-cc2ItE8eYCtV5ro,1834
98
+ eval_framework/tasks/benchmarks/arc_de.py,sha256=Ng7n0HeAEPh2SEHOTwIl1ccVCWH9iI0-U11mBe0aR38,1871
99
+ eval_framework/tasks/benchmarks/arc_fi.py,sha256=fgjdHN0pq8V_R_vMsLsk9Q2Mf7JwKF0vVFazESHtf2o,1858
100
+ eval_framework/tasks/benchmarks/belebele.py,sha256=x3V5DsNdkJpwMCWi5SOVIMJS7-ZZhi5E3XttGa23bR8,2213
101
+ eval_framework/tasks/benchmarks/bigcodebench.py,sha256=vrEySwqQTAEFnWGDoTzAOL6IVSwulbK7Rp60hxyOlPc,5892
102
+ eval_framework/tasks/benchmarks/casehold.py,sha256=hFWW1LnzVHk8un58flGLh_JOv3h95XHwOB-WenIWOJc,1727
103
+ eval_framework/tasks/benchmarks/chembench.py,sha256=GaPW0oBYLx4cQaZuvcFdDIL5XG5YUurqRcdaWXSzVgo,3522
104
+ eval_framework/tasks/benchmarks/copa.py,sha256=yJDy93Kjv1mVfwUORZidsAdPVrRjr6HKP8LREuOlQyU,1483
105
+ eval_framework/tasks/benchmarks/duc.py,sha256=6VAk38UrbuG63gr-K69WZ016g-EH0ONxTzictJ-AcN0,3516
106
+ eval_framework/tasks/benchmarks/flores200.py,sha256=HzgSaARfqTJaSQo71GUBJucY9ZZg6m0d4T3XNz9QARo,2176
107
+ eval_framework/tasks/benchmarks/flores_plus.py,sha256=bTKH8ECFdZRw-3RV-37a2clpY7u1Y2QigVeXIQoI2c0,3346
108
+ eval_framework/tasks/benchmarks/gpqa.py,sha256=bOg1oV1OXLsLHdYR8rlabJi2o4FA8szOrbiyRIsNGQs,7873
109
+ eval_framework/tasks/benchmarks/gsm8k.py,sha256=-JKQIMOZP0tF-GpZzGZNmqoAfMtCoDFAyagASln2Vbc,5790
110
+ eval_framework/tasks/benchmarks/hellaswag.py,sha256=j5KzRNSdIC_oBMrDCIOTsWoDBpfksodECfbneWKL2kU,1691
111
+ eval_framework/tasks/benchmarks/hellaswag_de.py,sha256=09sItfKknm6Xm-NKm5HcBgm-EYlm0dBqdgkEcXCReVk,2091
112
+ eval_framework/tasks/benchmarks/humaneval.py,sha256=wDP8ymSaqrhe28pTXavt0fxayA-cdUU8eOp5V8Q6T40,3370
113
+ eval_framework/tasks/benchmarks/ifeval.py,sha256=sww3y21udT1xCdf1fmh7z4EZ6-XLMR5fFgqUdwCUmZY,2826
114
+ eval_framework/tasks/benchmarks/include.py,sha256=Io4IFYTOCEoolVMRvjMEc58YJSJh4FcNnJ7wCYOmeIo,3380
115
+ eval_framework/tasks/benchmarks/infinitebench.py,sha256=bDNkNNe2v1FNOwGR9fHbTaUXFJJxNlHR6emarFjPFE4,11024
116
+ eval_framework/tasks/benchmarks/math_reasoning.py,sha256=cFs2x67imAGxdiPbv4YrgmkotxXtUdkr1v0PaDvLElg,22127
117
+ eval_framework/tasks/benchmarks/mbpp.py,sha256=hX8NnmI8iV4L35BX1-OCESNtfQq6hPryVKXJ_rsYQCI,7530
118
+ eval_framework/tasks/benchmarks/mmlu.py,sha256=pSIdhMABfKGY210M-XWZ7yZCRv-DULJM6-fR2iZDCHY,6657
119
+ eval_framework/tasks/benchmarks/mmlu_de.py,sha256=MAmiVWR-tSQFT383lKz_z0b0pdhWQ7PjPwtUuIGCb64,4634
120
+ eval_framework/tasks/benchmarks/mmlu_pro.py,sha256=zlUNC1M0XmydOccv_2pr5RYoFEmUdJKfHdn7Pt3bs4Q,5288
121
+ eval_framework/tasks/benchmarks/mmmlu.py,sha256=Kr6WVv4Z2SH3VJU549LF1yUXIPf1wvL60t7WdxfOcY8,24029
122
+ eval_framework/tasks/benchmarks/openbookqa.py,sha256=8OaC9hECRg9G0CBNhpx0OBThwD4OgjodD2xkvSshBPo,1524
123
+ eval_framework/tasks/benchmarks/opengptx_eu20.py,sha256=cg24VDABCug5pO2NOeLRNh2L5c536IJw_WNob4Zr1K0,14890
124
+ eval_framework/tasks/benchmarks/pawsx.py,sha256=VnTTi396NQzBiUfUxsJ14WRvDibYEDL4S7cHDjoQ96Y,3106
125
+ eval_framework/tasks/benchmarks/piqa.py,sha256=vnt6OHSFit7IHGfhz2qlNJWea4TmWXgIZulyU7lWNqY,1465
126
+ eval_framework/tasks/benchmarks/quality.py,sha256=8GTmOAsX3cxgNDYaTT31cN8N-xSr6iVIR0pvhD8aqeo,1955
127
+ eval_framework/tasks/benchmarks/sciq.py,sha256=oNjEJJ1dy-uPtq7uSojOAj1znR71fnI4Fa2awZJ8hTk,1593
128
+ eval_framework/tasks/benchmarks/sphyr.py,sha256=68miYCDlJHBTU7vTXXnPUQ54DJy_QwLLq-pjiAaXbJw,3004
129
+ eval_framework/tasks/benchmarks/squad.py,sha256=LqF4NTC11HazddgeLFsosd2b2KYJnQ-SWL6hHKuZDpE,3247
130
+ eval_framework/tasks/benchmarks/struct_eval.py,sha256=vz6b26q_uz1Yyk7mmAeCo3UP0oA-Ih5-PG6S-6ojYeg,3850
131
+ eval_framework/tasks/benchmarks/tablebench.py,sha256=MxQlW7d62hBqm0HZ93XxzVHVn_JIeqK4UJO4-b5tO3U,4912
132
+ eval_framework/tasks/benchmarks/triviaqa.py,sha256=S1RTJrJaeowmgjtMkcDxjUbu_9y35Eo1x9HouOqA_M0,1646
133
+ eval_framework/tasks/benchmarks/truthfulqa.py,sha256=cXS8z8udxHqxB0YRo3ZTuqJiqVcTa8DIGvhigu4MJMY,3845
134
+ eval_framework/tasks/benchmarks/winogender.py,sha256=ejv27r22F-hoU_kRvhe7YyZgr4ZjlVHzAnvEHYCFS0s,1541
135
+ eval_framework/tasks/benchmarks/winogrande.py,sha256=t9irSc8FqD2fbFredvvf-ACEXc1-QlU-4x39KXP-YXs,1634
136
+ eval_framework/tasks/benchmarks/winox.py,sha256=Qvbb_HLYpQlEKeOkJ22nFGAWlYjPQhMEp-Ra95_6k0U,1996
137
+ eval_framework/tasks/benchmarks/wmt.py,sha256=I54AmQNX4uv_7k5U3nYdQi2DhQIuAkOzRZCbUZYyZ64,5690
138
+ eval_framework/tasks/benchmarks/zero_scrolls.py,sha256=qUgoDICnmEXM1EZdV2PTUXf4YzUjyOCwmgWACN_-0zE,7862
139
+ eval_framework/tasks/eval_config.py,sha256=d49KUjBqIiH2sBGaIs0sylR2WQKsR5sDnbFXL_P3n1Q,4311
140
+ eval_framework/tasks/perturbation.py,sha256=ZtXMqPk9YSWiX6ytgUXEeacA1LhAgvkJyxuO7MruJho,3532
141
+ eval_framework/tasks/registry.py,sha256=d4uYpg8JOfStl-r0mExaJOYL4rqsXD9RAQ93fi32D7I,5738
142
+ eval_framework/tasks/task_loader.py,sha256=js6Um6ZoFFVl1FHfrbH4EQnFEX8-_jgdDAHvrPNULDw,3669
143
+ eval_framework/tasks/task_names.py,sha256=JbOHvhkii11-8nXmAqaqN_AuhRJMvMv514KmSfSI1Mw,8954
144
+ eval_framework/tasks/utils.py,sha256=QdTXOeWkUbC0hCcZbAPFRWFCu1KkP-9kFukIoyaSJkc,19785
145
+ eval_framework/utils/constants.py,sha256=LEElGdYrkIWm8dJa7lfD5LbL-fwkF17Z0nQ7_XVg098,164
146
+ eval_framework/utils/generate_task_docs.py,sha256=KBLicduL27fwa0bWwxPw_tfDcChn2E0GYpUt3dB9ldo,8999
147
+ eval_framework/utils/helpers.py,sha256=KCVUcGw-Hvkf6Qs7h4fqRxELORIlPXTEMDV5zO_2IYU,160
148
+ eval_framework/utils/logging.py,sha256=mpGNSoam2N3YkTClKF2hoe4XftsOZFa4bE9HHU6EXV0,1648
149
+ eval_framework/utils/packaging.py,sha256=Z_eXjzcgCvifJwJ-pqeAtNPVjscgR6QYNKe4E-iSnFc,1889
150
+ template_formatting/README.md,sha256=gVrps3xXKPP87lWmKzaaEjNMBikQVj_MMZ-FdG3O6Xg,3787
151
+ template_formatting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
152
+ template_formatting/formatter.py,sha256=2U39HhQVWEy__5WysrapUR_GBjogcgmqnp47IKJf57Y,21490
153
+ template_formatting/mistral_formatter.py,sha256=NJESWDsd_QFx8USoTGJ1QOYczwXf-ObjnpTy7mqzjK0,6734
154
+ template_formatting/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
+ template_formatting/tests/test_formatter_eval.py,sha256=Z7YLmPLzOqJP7TbtdDDAWR_WBhlnCKBRNnm2NXT8jmk,17186
156
+ template_formatting/tests/test_formatter_scaling.py,sha256=jMdul-2urA6ouVY6d5iKdGhiaLMEp5cDHr5tVIJaVrU,9405
157
+ template_formatting/tests/test_mistral_formatter.py,sha256=V95tEjxoEzH3Eai7aHaqDkOou0nPkc9v9mx9yV48PqQ,6068
158
+ eval_framework-0.2.0.dist-info/WHEEL,sha256=Jb20R3Ili4n9P1fcwuLup21eQ5r9WXhs4_qy7VTrgPI,79
159
+ eval_framework-0.2.0.dist-info/entry_points.txt,sha256=k4dpbNwZ5XnovyqrScWTZ-UYzf_EPYOvZA2QTkqrYlk,59
160
+ eval_framework-0.2.0.dist-info/METADATA,sha256=ykdyXfaEI1hexHDSKwvUICz15Bzz-pfeuypJtYW-mFU,25935
161
+ eval_framework-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.8.15
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ eval_framework = eval_framework.run:run
3
+
@@ -0,0 +1,83 @@
1
+
2
+ # Internal template formatting package
3
+
4
+ Single source of truth for internal template formatting. Ensures compatibility between `scaling-internal` and `eval-framework`.
5
+
6
+ ### Install uv
7
+
8
+ `uv` is used for dependency management and packaging in Python projects. To install uv, follow the [official instructions](https://docs.astral.sh/uv/getting-started/installation/).
9
+
10
+
11
+ ## Project Structure
12
+ - src/: Contains the template formatting code
13
+ - tests/: Contains pytest test cases.
14
+ - test_formatter_eval.py: Basic unit tests for the template formatter derived from `eval-framework`
15
+ - test_formatter_scaling.py: Basic unit tests for the template formatter derived from `scaling-internal`
16
+ - pyproject.toml: Configuration file for uv and other tools like MyPy, ruff and pytest.
17
+
18
+
19
+ ## Adding dependencies
20
+
21
+ - **Adding Production Dependencies**: These are dependencies necessary for your project to run. For example, if your project uses Pydantic for data validation, you would add it as a production dependency:
22
+
23
+ ```bash
24
+ uv add pydantic
25
+ ```
26
+ - **Adding Development Dependencies**: These are dependencies that are only needed during development, such as testing libraries or linters. For instance, to add pytest for writing and running tests, you would specify it as a development dependency:
27
+
28
+ ```bash
29
+ uv add --group dev pytest
30
+ ```
31
+
32
+ After adding any new dependencies, you need to install them to update your project's virtual environment:
33
+ ```bash
34
+ uv sync
35
+ ```
36
+ This command ensures that all dependencies listed in your pyproject.toml file are correctly installed and available for use in your project.
37
+
38
+ To install all dependencies (including optional ones), run
39
+ ```bash
40
+ uv sync --all-extras
41
+ ```
42
+
43
+ ## Usage
44
+ **Running Commands with uv**
45
+
46
+ `uv` creates a virtual environment for your project, which isolates your dependencies from the global Python environment. This isolation helps prevent version conflicts and ensures reproducibility. Here's how to use `uv` to run commands:
47
+
48
+ - **Installation**: To set up pre-commit hooks, you first need to install the pre-commit package and then install the hooks.
49
+
50
+ You can either follow the [install instructions](https://pre-commit.com/#install) or install it globally through `uv tool install pre-commit`.
51
+
52
+ ```bash
53
+ pre-commit install
54
+ ```
55
+
56
+ - **Running Hooks Manually**: Although pre-commit hooks are triggered automatically before each commit, you can also run them manually to check your files at any time:
57
+
58
+ ```bash
59
+ pre-commit run -a
60
+ ```
61
+ This command runs all hooks against all files, which is useful for initial setup or periodic checks.
62
+
63
+ - **Current Hooks**:
64
+ - **Check JSON**: Ensures JSON files are valid.
65
+ - **Pretty format JSON**: Formats JSON files to be more readable.
66
+ - **Fix End of Files**: Ensures files end with a newline.
67
+ - **Trim Trailing Whitespace**: Removes unnecessary trailing whitespace.
68
+ - **Ruff**: Runs the Ruff linter to check Python code for stylistic and programming errors.
69
+ - **Ruff-format**: Automatically formats Python code using Ruff.
70
+
71
+
72
+ - **Static Type Checking with MyPy**: To ensure your code is type-safe, run MyPy to check for type errors. This should be done frequently during development to catch type-related issues early:
73
+ ```bash
74
+ uv run --all-extras mypy ./src
75
+ uv run --all-extras mypy ./tests
76
+ ```
77
+ Run these commands after making changes to your source or test files to verify that your changes haven't introduced type errors.
78
+
79
+ - **Running Tests with pytest**: To ensure your code works as expected and hasn't broken existing functionality, run your tests:
80
+ ```bash
81
+ uv run --all-extras pytest
82
+ ```
83
+ Run this command frequently during development, especially before committing changes, to ensure all tests pass.
File without changes