eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +177 -0
  5. eval_framework/context/eval.py +121 -0
  6. eval_framework/context/local.py +78 -0
  7. eval_framework/evaluation_generator.py +234 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +432 -0
  16. eval_framework/llm/base.py +180 -0
  17. eval_framework/llm/huggingface.py +418 -0
  18. eval_framework/llm/mistral.py +88 -0
  19. eval_framework/llm/models.py +28 -0
  20. eval_framework/llm/openai.py +400 -0
  21. eval_framework/llm/vllm.py +554 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +166 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/aidanbench.py +28 -0
  29. eval_framework/metrics/completion/bleu.py +76 -0
  30. eval_framework/metrics/completion/chrf.py +62 -0
  31. eval_framework/metrics/completion/code_assertion.py +44 -0
  32. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  33. eval_framework/metrics/completion/comet.py +56 -0
  34. eval_framework/metrics/completion/concordance_index.py +38 -0
  35. eval_framework/metrics/completion/csv_format.py +102 -0
  36. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  37. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  38. eval_framework/metrics/completion/f1.py +42 -0
  39. eval_framework/metrics/completion/format_checker.py +56 -0
  40. eval_framework/metrics/completion/grid_difference.py +77 -0
  41. eval_framework/metrics/completion/ifeval.py +73 -0
  42. eval_framework/metrics/completion/json_format.py +179 -0
  43. eval_framework/metrics/completion/language_checker.py +74 -0
  44. eval_framework/metrics/completion/length_control.py +83 -0
  45. eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
  46. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  47. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  48. eval_framework/metrics/completion/repetition.py +88 -0
  49. eval_framework/metrics/completion/rouge_1.py +35 -0
  50. eval_framework/metrics/completion/rouge_2.py +45 -0
  51. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  52. eval_framework/metrics/completion/rouge_l.py +52 -0
  53. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  54. eval_framework/metrics/completion/ter.py +67 -0
  55. eval_framework/metrics/completion/text_counter.py +182 -0
  56. eval_framework/metrics/efficiency/__init__.py +0 -0
  57. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  58. eval_framework/metrics/llm/__init__.py +0 -0
  59. eval_framework/metrics/llm/base.py +34 -0
  60. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  61. eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  62. eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
  63. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  64. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  65. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  66. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  67. eval_framework/metrics/llm/graders/language.py +56 -0
  68. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  69. eval_framework/metrics/llm/graders/models.py +74 -0
  70. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  71. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  72. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  73. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  74. eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  75. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  76. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  77. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  78. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  79. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  80. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
  81. eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
  82. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  83. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  84. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  85. eval_framework/metrics/llm/utils.py +20 -0
  86. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  87. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  88. eval_framework/metrics/loglikelihood/base.py +50 -0
  89. eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  90. eval_framework/metrics/loglikelihood/dcs.py +43 -0
  91. eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
  92. eval_framework/metrics/loglikelihood/ternary.py +42 -0
  93. eval_framework/py.typed +0 -0
  94. eval_framework/response_generator.py +351 -0
  95. eval_framework/result_processors/__init__.py +0 -0
  96. eval_framework/result_processors/base.py +88 -0
  97. eval_framework/result_processors/hf_uploader.py +75 -0
  98. eval_framework/result_processors/result_processor.py +129 -0
  99. eval_framework/result_processors/wandb_uploader.py +137 -0
  100. eval_framework/run.py +369 -0
  101. eval_framework/run_direct.py +42 -0
  102. eval_framework/shared/types.py +227 -0
  103. eval_framework/tasks/__init__.py +6 -0
  104. eval_framework/tasks/base.py +392 -0
  105. eval_framework/tasks/benchmarks/__init__.py +0 -0
  106. eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  107. eval_framework/tasks/benchmarks/arc.py +70 -0
  108. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  109. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  110. eval_framework/tasks/benchmarks/belebele.py +60 -0
  111. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  112. eval_framework/tasks/benchmarks/casehold.py +47 -0
  113. eval_framework/tasks/benchmarks/chembench.py +85 -0
  114. eval_framework/tasks/benchmarks/copa.py +64 -0
  115. eval_framework/tasks/benchmarks/duc.py +91 -0
  116. eval_framework/tasks/benchmarks/flores200.py +133 -0
  117. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  118. eval_framework/tasks/benchmarks/gpqa.py +201 -0
  119. eval_framework/tasks/benchmarks/gsm8k.py +150 -0
  120. eval_framework/tasks/benchmarks/hellaswag.py +69 -0
  121. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  122. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  123. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  124. eval_framework/tasks/benchmarks/include.py +119 -0
  125. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  126. eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
  127. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  128. eval_framework/tasks/benchmarks/mmlu.py +215 -0
  129. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  130. eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
  131. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  132. eval_framework/tasks/benchmarks/openbookqa.py +85 -0
  133. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  134. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  135. eval_framework/tasks/benchmarks/piqa.py +64 -0
  136. eval_framework/tasks/benchmarks/quality.py +56 -0
  137. eval_framework/tasks/benchmarks/sciq.py +110 -0
  138. eval_framework/tasks/benchmarks/sphyr.py +79 -0
  139. eval_framework/tasks/benchmarks/squad.py +211 -0
  140. eval_framework/tasks/benchmarks/struct_eval.py +116 -0
  141. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  142. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  143. eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
  144. eval_framework/tasks/benchmarks/winogender.py +64 -0
  145. eval_framework/tasks/benchmarks/winogrande.py +69 -0
  146. eval_framework/tasks/benchmarks/winox.py +57 -0
  147. eval_framework/tasks/benchmarks/wmt.py +160 -0
  148. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  149. eval_framework/tasks/eval_config.py +136 -0
  150. eval_framework/tasks/perturbation.py +83 -0
  151. eval_framework/tasks/registry.py +186 -0
  152. eval_framework/tasks/task_loader.py +81 -0
  153. eval_framework/tasks/task_names.py +324 -0
  154. eval_framework/tasks/utils.py +584 -0
  155. eval_framework/utils/constants.py +9 -0
  156. eval_framework/utils/file_ops.py +245 -0
  157. eval_framework/utils/generate_task_docs.py +244 -0
  158. eval_framework/utils/helpers.py +32 -0
  159. eval_framework/utils/logging.py +62 -0
  160. eval_framework/utils/packaging.py +52 -0
  161. eval_framework/utils/tqdm_handler.py +14 -0
  162. eval_framework-0.2.7.dist-info/METADATA +548 -0
  163. eval_framework-0.2.7.dist-info/RECORD +170 -0
  164. eval_framework-0.2.7.dist-info/WHEEL +4 -0
  165. eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
  166. template_formatting/README.md +83 -0
  167. template_formatting/__init__.py +0 -0
  168. template_formatting/formatter.py +537 -0
  169. template_formatting/mistral_formatter.py +159 -0
  170. template_formatting/py.typed +0 -0
@@ -0,0 +1,170 @@
1
+ eval_framework/__init__.py,sha256=dLv--h62kDYK2uN5aFpEowXpW2P9XLwMud-NwoiW_u4,120
2
+ eval_framework/base_config.py,sha256=LJOHr0MtE9PPsfbLmP2tpoa52Tt0rIHMaW3CTYVwehs,1236
3
+ eval_framework/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ eval_framework/context/determined.py,sha256=YNUnwZC8lIAOcsCV6ecHu47lWIMpMdXwpLsLm-DOAUk,7372
5
+ eval_framework/context/eval.py,sha256=pxd8L-GAjFz40sYcRskm6bAt9nno1jRaNSOJRU2S4YU,4515
6
+ eval_framework/context/local.py,sha256=XFqWSeaeG-ASioU3eabmbGvZPN9CEqa18sE3ukiVRXg,3192
7
+ eval_framework/evaluation_generator.py,sha256=edktgkM357yRwgeukDeZcJBYsClqWAQ3mLTRzvGOFXA,11846
8
+ eval_framework/exceptions.py,sha256=j4jjN2Y-8vMxf0Dfms1buAJHNMzEQ6kZca6l_z-lDBo,38
9
+ eval_framework/external/ifeval_impl/README.md,sha256=fC2t3BSbjW_Hl8iAUoTwiFpblgY1NeqeF67tl5ScWT4,408
10
+ eval_framework/external/ifeval_impl/instructions.py,sha256=fp94wBZv0SQgm7OTTrguh1yiscPoYst8MqoBmoO_A6k,55615
11
+ eval_framework/external/ifeval_impl/instructions_registry.py,sha256=TzNBdO5rHl3jPwvm-o83IpJ8l1o0DoG2jp7gDSd54RU,6722
12
+ eval_framework/external/ifeval_impl/instructions_util.py,sha256=qUb8wipLfBMvHv3UpMTn-yZay_2JU7X_524f141xHJs,26095
13
+ eval_framework/external/ifeval_impl/utils.py,sha256=i2ADNqLmcBlBAdL7BZMa4HoTXJ3DU01UL01-7grcebg,4537
14
+ eval_framework/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ eval_framework/llm/aleph_alpha.py,sha256=xvUicZoILrWtdgOaYTwtyRZ7WR41i61nBrGBUd3pieg,18461
16
+ eval_framework/llm/base.py,sha256=5JzVxVyix0DG9cVtViMMxN5Wt0cRyVKmiWq5L-1iEoE,7948
17
+ eval_framework/llm/huggingface.py,sha256=Ovq3QZ4ducKxPJp8FisTX5Q57EKB276yfzGPd0pD8KA,18041
18
+ eval_framework/llm/mistral.py,sha256=vn1spuH0uXCtL7zi9cmteoLIiCQy6c8EvyrD0BpVBOs,3544
19
+ eval_framework/llm/models.py,sha256=tSq3jpVBG9OVK4i1MWesZGtEWzbwfn6Vjv6PqLYrhak,937
20
+ eval_framework/llm/openai.py,sha256=QZo3vPPUrRxD76NlIGFgcMh84zWF1TrW706fgoUX-gw,16447
21
+ eval_framework/llm/vllm.py,sha256=9Oa712oJKYNTlKdu30pDS-R13HW9AoyQL_iF0AosRGU,21766
22
+ eval_framework/logger.py,sha256=8Bj7S8JRYh-SJZ3dEgueDIoVrhOjRyDsnRuLG61ft9E,61
23
+ eval_framework/main.py,sha256=yM0BlPAUTbUZ2VD_WdZK7nRbps8bSrWnjOQCu5-VhFE,6829
24
+ eval_framework/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ eval_framework/metrics/base.py,sha256=3VmIfC8AN-SXFf_7d_5fnwTQirDKBtJI5JxwDGOhtDU,1013
26
+ eval_framework/metrics/completion/__init__.py,sha256=3m1ekU7MH8JqV-6VHRBRQiPatqpZsNW6tQKpaXjpztE,52
27
+ eval_framework/metrics/completion/accuracy_completion.py,sha256=t-6lJBZ6dhhppepIkLEmB3TSd9qkGE3mrPYaDhnql98,697
28
+ eval_framework/metrics/completion/aidanbench.py,sha256=Kyr9aW2jdyt0NCfE8ytRCmNuU6f5lRImb3fHKLgoJUA,1048
29
+ eval_framework/metrics/completion/bleu.py,sha256=IDO3Hn-VgH7eT83iO9FCBI8gBUcj8cMOc1kfm_E73uI,3311
30
+ eval_framework/metrics/completion/chrf.py,sha256=o0zbwOpbL99fg00neET1Pb7jsfT8Sd1n-px_Jql43X8,2526
31
+ eval_framework/metrics/completion/code_assertion.py,sha256=f9XYPJzP6XWD2wqZ1_qWYyw56IhBnxp7hBmymw30ExA,1489
32
+ eval_framework/metrics/completion/code_execution_pass_at_one.py,sha256=1GyruuwS12UXfLSo3K7rRy2CZhC3W0oOAilFzAUtOeU,5123
33
+ eval_framework/metrics/completion/comet.py,sha256=M_4ITNfthjxqX8CgVKlxK5W7Gdu08FbXsmbGOx4SfSA,2333
34
+ eval_framework/metrics/completion/concordance_index.py,sha256=LfmM4KmXKiPbztoJaBRCDMA6lQdPFhHcRTYjNP0olQk,1369
35
+ eval_framework/metrics/completion/csv_format.py,sha256=sxo8xnEkGUw7FnkkZC2k58yn3GPuJQ_rJAFNLLo2sNE,3640
36
+ eval_framework/metrics/completion/cwe_accuracy.py,sha256=1LV35uxoDlKzLE_XWBItMMVsGBLqXP2DfqiI0L2T-dI,2130
37
+ eval_framework/metrics/completion/exponential_similarity.py,sha256=93rQV_pG7RbFMt0DWCDQe8iUiF9GzcTneHRxvH9tIgI,2702
38
+ eval_framework/metrics/completion/f1.py,sha256=ddHQXsQv5keZDrJvoY_nPPZtqZMEfrRrafeSWg6HQys,1512
39
+ eval_framework/metrics/completion/format_checker.py,sha256=JUgx3EbxsZEJr0bNlmQFQdQzkghvegq8QtC4vxQjvaI,1997
40
+ eval_framework/metrics/completion/grid_difference.py,sha256=sun639fzMNkhjoesfgRIsy7dofF5vxzbKlvVvUfA_y4,3104
41
+ eval_framework/metrics/completion/ifeval.py,sha256=93KxO8qfE6-9snppzpr3a7jCmCT2ciJOqWcK31VB2No,2578
42
+ eval_framework/metrics/completion/json_format.py,sha256=EV2Zb9OhETx-i2eJm48qR62S13r_2XHHVjM6UuZfKb4,6522
43
+ eval_framework/metrics/completion/language_checker.py,sha256=QO9yhHe99ZkvZxLSZ5m5B8N_oRVNsZeklg0b5MfUadg,3323
44
+ eval_framework/metrics/completion/length_control.py,sha256=15_S5m7SNFNR5KXNhmvTy3pGhtsuawlRU76w-ehLix8,3294
45
+ eval_framework/metrics/completion/math_reasoning_completion.py,sha256=wzhdNggAQxwC8Kpmb_ZsX6_SZx7h9IRfnFJuDBIBFxA,12221
46
+ eval_framework/metrics/completion/niah_accuracy.py,sha256=ycFUVXpJqdA_-aBvmzKUfaSpPi_-nCDY4F27kQjsPks,5803
47
+ eval_framework/metrics/completion/placeholder_checker.py,sha256=PhpPlcrP_QDYCOJuWK12ZfcUAOYys9IxZOKICTNUa1U,1147
48
+ eval_framework/metrics/completion/repetition.py,sha256=MRsap8ZDISDfC5luqWlQA05W_anjFU6XzzvD55LsM_M,3340
49
+ eval_framework/metrics/completion/rouge_1.py,sha256=Y1m7e9q258cIFjIfGShssneFn08_85ZQF6-YqIgOORQ,1514
50
+ eval_framework/metrics/completion/rouge_2.py,sha256=3GKFHVXHKvPOjk4SaU6D1vbykK5WeE6Q2Ogjhasa1uk,1978
51
+ eval_framework/metrics/completion/rouge_geometric_mean.py,sha256=0fqiWx72eJscuLkekh901CwhFInN9HoxQ2LJod40fJs,1730
52
+ eval_framework/metrics/completion/rouge_l.py,sha256=SwM1s7MQWKjVPlS0KyHcEH9pzkA-hlidz-4gM9kiTu4,2360
53
+ eval_framework/metrics/completion/struct_eval_metrics.py,sha256=8wBx7yTfzjww1wPST57X9sjrVNHavtKXZcOiCkbNrZk,8148
54
+ eval_framework/metrics/completion/ter.py,sha256=mskQejjl1RX0WuSQk1e42-L1QfH0kwTVIhDwqbaBNEc,2614
55
+ eval_framework/metrics/completion/text_counter.py,sha256=UXBOt7okRZHx6BuVcyAS9IeNoYSnryLKkdgYn0FArF8,7100
56
+ eval_framework/metrics/efficiency/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
+ eval_framework/metrics/efficiency/bytes_per_sequence_position.py,sha256=fPNqu_fQSqy__1Es5Zbm0niBr8N6j-jnprY-ysAFrds,1849
58
+ eval_framework/metrics/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
+ eval_framework/metrics/llm/base.py,sha256=pRqppTNG0MUpG-5rJqlQ4sGlR4lFcK1cZ9B7g9XikeM,1276
60
+ eval_framework/metrics/llm/graders/chatbot_style_grader.py,sha256=7tplUGC7G_F730t9Ij242dBRbQKUaCzURP1iX6ZKgrI,4114
61
+ eval_framework/metrics/llm/graders/coherence_grader.py,sha256=j-opPaQfv6co3_SXEjc8ICIeb-3rQ7I6sct8nLe-R1c,4208
62
+ eval_framework/metrics/llm/graders/comparison_grader.py,sha256=jEBnXQN6ebyexPCqg48L4ZpFoVYnT3WU-pjOY1NWzz0,7461
63
+ eval_framework/metrics/llm/graders/conciseness_grader.py,sha256=-WE7dOo7Jo57UzmesAr61WKurB9NegNBVtPLmViLOZw,3562
64
+ eval_framework/metrics/llm/graders/contains_names_grader.py,sha256=5NUGVcAzkyGJ1or5uReCbUJT3psplnHTd7dUkf_iR0Y,2724
65
+ eval_framework/metrics/llm/graders/format_correctness_grader.py,sha256=1ewPCXj97favA3BovNSOpHRILhtsTbmp5vWJfzk-968,4549
66
+ eval_framework/metrics/llm/graders/instruction_grader.py,sha256=v9ew30JHpO8LK99D2FYhFz6E-ikE4PIld3sCT79u0gk,11625
67
+ eval_framework/metrics/llm/graders/language.py,sha256=9YlEE3BjvzfHfQtRMTWrP_NxGbjKbZRbAjqo3GvL_wE,1720
68
+ eval_framework/metrics/llm/graders/long_context_grader.py,sha256=BX29D8BsVoVGOfGlQjAfFMJFw2Nn77puwMOBnHJvJoE,2476
69
+ eval_framework/metrics/llm/graders/models.py,sha256=PVGzyjOcmm-DN-NpoO8SzFyUNVoDLG330f3uFXG0SfE,2206
70
+ eval_framework/metrics/llm/graders/refusal_grader.py,sha256=SUFUiveL36LWyKR5w8LUgYl2Kx4aAc5IPu5uV8j4N5k,2272
71
+ eval_framework/metrics/llm/graders/sql_quality_grader.py,sha256=ooNCxBNKeyqFxf2nAKdtUcd7aIMQpmxcEn9iTo5XhiQ,5624
72
+ eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py,sha256=lZJzXyMNYLhY4RmrPZsBxJByPXWMk8aeDjGxUArwv4U,4599
73
+ eval_framework/metrics/llm/llm_judge_chatbot_style.py,sha256=pb_GWN5xVHRuk64XPNkIqRV1htKaNmr-Cdjx9jxgGiw,1400
74
+ eval_framework/metrics/llm/llm_judge_coherence.py,sha256=NrzkJ2vMQnVKFuKd834StrpYVNBNre8-StAii0g59E8,1559
75
+ eval_framework/metrics/llm/llm_judge_completion_accuracy.py,sha256=KS1Fn3cZAyIfXd8LM_o2s9IjoHpftqtuSIJ3fGDAr6Y,1523
76
+ eval_framework/metrics/llm/llm_judge_conciseness.py,sha256=v2iSxBeUU3QTjdy0hx-9t5j0pf4LMnp5z2JCiqpN9_8,1439
77
+ eval_framework/metrics/llm/llm_judge_contains_names.py,sha256=7r-sAI6Qwej4fgQIhmotXtEK5ZaLcHxgyjbP7TYzRtE,1401
78
+ eval_framework/metrics/llm/llm_judge_format_correctness.py,sha256=AwHLblRtWSo7hg0sJpcdQAZP7ldrfZFDp2rGB9-6rns,1668
79
+ eval_framework/metrics/llm/llm_judge_instruction.py,sha256=PcXACNijZSYIfLoks-bqCgjqo0YPqQpX4O5GinC2SvE,2170
80
+ eval_framework/metrics/llm/llm_judge_mtbench_pair.py,sha256=DhaM5iDJNDgg9TZNo7FPXldmZwuVtZWCPxO6ppFj1O0,29297
81
+ eval_framework/metrics/llm/llm_judge_mtbench_single.py,sha256=gAeewUHh-EuS9mP57Iiptl1Z0RuSHzEF8ldI_2Howkc,18468
82
+ eval_framework/metrics/llm/llm_judge_refusal.py,sha256=iAoOstgOvKtk9M9wqVqrf21mM0Xbss4EraO7R3g9FBQ,1418
83
+ eval_framework/metrics/llm/llm_judge_sql.py,sha256=qMj2pHzijq2lVHqToewQL_xJSgKLulZWSb64996ztnQ,14480
84
+ eval_framework/metrics/llm/llm_judge_world_knowledge.py,sha256=C48aHS6bcVtGMk0YxzqDAGiHekypyeo--SK7EFVN5Jc,1517
85
+ eval_framework/metrics/llm/utils.py,sha256=3rfaP7O1c8OOatOGNO3kZcLFCvZXoPplSjkju7eck3E,728
86
+ eval_framework/metrics/loglikelihood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
+ eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py,sha256=l0OxJFSQiLnwJdfX72SH9k-krZr8AI1FOUYlHfiT2Q0,1921
88
+ eval_framework/metrics/loglikelihood/base.py,sha256=TJoJ5jXjPiC0xHeIQiWWRha2C_h1A1Bd0U9pwhQBdRg,1935
89
+ eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py,sha256=0meEs3EaVU72SwflS0Em5DvSEaFbu2zD_NOtVjIr6CQ,1148
90
+ eval_framework/metrics/loglikelihood/dcs.py,sha256=HqqpL_BdoB-Uq_6buDSUK2YVUkc20nstnHPqGdDM2RY,1935
91
+ eval_framework/metrics/loglikelihood/probability_mass.py,sha256=HyBlsz64lGcU2PCL2AYZQ9qS7olOofvtpUnskdT8D0s,2196
92
+ eval_framework/metrics/loglikelihood/ternary.py,sha256=aU1RyXBnKnElHSNOxXOw3fvL0iUuOLPlh-TQCTAk0Bw,1825
93
+ eval_framework/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
+ eval_framework/response_generator.py,sha256=blIOIzP25JPEmQYPUqqLCkR2NRgOnlBLfN1T1DpCgHU,16383
95
+ eval_framework/result_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
96
+ eval_framework/result_processors/base.py,sha256=30HzZLs4Rk0MhHKuE13wnkMxz47yBjeMraLzmz6f-Yc,2445
97
+ eval_framework/result_processors/hf_uploader.py,sha256=P2EUX5KB5DFAl4PZYTZWOG2a_8aYIBwFKSeVMu2tGs0,2898
98
+ eval_framework/result_processors/result_processor.py,sha256=eIzs30XwbhAO7vODudNw0oS8jsk9N04G0_wdhcI9MkQ,5456
99
+ eval_framework/result_processors/wandb_uploader.py,sha256=3noM9S0kVkx9RbblvB5I4Fy4jtRTWmq1pGQSSR1Des8,6266
100
+ eval_framework/run.py,sha256=-oqjRdkIGsQgXWXI3cGcTQAk9FDpvFfiUbiFu-ygn5U,11967
101
+ eval_framework/run_direct.py,sha256=KMWkLDuDt-HPlmjsSGKAiXd7LlrpVUPKv89Gk3i0snA,1176
102
+ eval_framework/shared/types.py,sha256=lPA5uhdRgs3H---SFsjUOYwUkqBYL0K2Y2JvxCOyMLc,8841
103
+ eval_framework/tasks/__init__.py,sha256=Fzs8DY53Dt0Gsu34Ro6Dk6by9qgaFF0UIIHERl6PO5g,120
104
+ eval_framework/tasks/base.py,sha256=LWkpIrdBDSq9VYi8W8iKcWve2T1_Oh76YDmczJcvOrA,15988
105
+ eval_framework/tasks/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ eval_framework/tasks/benchmarks/aidanbench.py,sha256=KIilefSLgM3SINYlALxKiWE9hlOwuNZaTZHnIJerOv4,9963
107
+ eval_framework/tasks/benchmarks/arc.py,sha256=XC968m7yMLDdUWQn3S8CzlUVzzyC2CwPziiTB5yjf1s,2854
108
+ eval_framework/tasks/benchmarks/arc_de.py,sha256=Ng7n0HeAEPh2SEHOTwIl1ccVCWH9iI0-U11mBe0aR38,1871
109
+ eval_framework/tasks/benchmarks/arc_fi.py,sha256=fgjdHN0pq8V_R_vMsLsk9Q2Mf7JwKF0vVFazESHtf2o,1858
110
+ eval_framework/tasks/benchmarks/belebele.py,sha256=x3V5DsNdkJpwMCWi5SOVIMJS7-ZZhi5E3XttGa23bR8,2213
111
+ eval_framework/tasks/benchmarks/bigcodebench.py,sha256=vrEySwqQTAEFnWGDoTzAOL6IVSwulbK7Rp60hxyOlPc,5892
112
+ eval_framework/tasks/benchmarks/casehold.py,sha256=hFWW1LnzVHk8un58flGLh_JOv3h95XHwOB-WenIWOJc,1727
113
+ eval_framework/tasks/benchmarks/chembench.py,sha256=GaPW0oBYLx4cQaZuvcFdDIL5XG5YUurqRcdaWXSzVgo,3522
114
+ eval_framework/tasks/benchmarks/copa.py,sha256=pTOBAnahtY8jiT8b0RV8AJwjzF_HBp89Fiu18msXulw,2535
115
+ eval_framework/tasks/benchmarks/duc.py,sha256=6VAk38UrbuG63gr-K69WZ016g-EH0ONxTzictJ-AcN0,3516
116
+ eval_framework/tasks/benchmarks/flores200.py,sha256=WOhjn2RzwFuvMIHL_t2pvSBTS2-zPIsonJD1ZGXhJdc,5177
117
+ eval_framework/tasks/benchmarks/flores_plus.py,sha256=bTKH8ECFdZRw-3RV-37a2clpY7u1Y2QigVeXIQoI2c0,3346
118
+ eval_framework/tasks/benchmarks/gpqa.py,sha256=n_CzSMeSWDAh21g60cC08Ut_PwcQn6vZwWTgtCCnP1U,8873
119
+ eval_framework/tasks/benchmarks/gsm8k.py,sha256=nU2iVGI9YEa-mh1Z3nzCwEcPNXEX15XlUWrnLpaAk2M,5874
120
+ eval_framework/tasks/benchmarks/hellaswag.py,sha256=PtC0AkFceUEFqt5HVbMHRAAQagsxq1x36yWcAoRp5YQ,2763
121
+ eval_framework/tasks/benchmarks/hellaswag_de.py,sha256=09sItfKknm6Xm-NKm5HcBgm-EYlm0dBqdgkEcXCReVk,2091
122
+ eval_framework/tasks/benchmarks/humaneval.py,sha256=wDP8ymSaqrhe28pTXavt0fxayA-cdUU8eOp5V8Q6T40,3370
123
+ eval_framework/tasks/benchmarks/ifeval.py,sha256=sww3y21udT1xCdf1fmh7z4EZ6-XLMR5fFgqUdwCUmZY,2826
124
+ eval_framework/tasks/benchmarks/include.py,sha256=Io4IFYTOCEoolVMRvjMEc58YJSJh4FcNnJ7wCYOmeIo,3380
125
+ eval_framework/tasks/benchmarks/infinitebench.py,sha256=bDNkNNe2v1FNOwGR9fHbTaUXFJJxNlHR6emarFjPFE4,11024
126
+ eval_framework/tasks/benchmarks/math_reasoning.py,sha256=mCqNY9ZKMp_k09S3ropdgiFma7SzWbs65rUsYyyouOA,22750
127
+ eval_framework/tasks/benchmarks/mbpp.py,sha256=hX8NnmI8iV4L35BX1-OCESNtfQq6hPryVKXJ_rsYQCI,7530
128
+ eval_framework/tasks/benchmarks/mmlu.py,sha256=O2RMG9u8zoUsQ06A8LIXNTBXZTh84_95REJ2sy4JN30,7755
129
+ eval_framework/tasks/benchmarks/mmlu_de.py,sha256=MAmiVWR-tSQFT383lKz_z0b0pdhWQ7PjPwtUuIGCb64,4634
130
+ eval_framework/tasks/benchmarks/mmlu_pro.py,sha256=II_adK324gobsZmrK2EP3yLgFaSV9WHhchwVXRky_UI,6398
131
+ eval_framework/tasks/benchmarks/mmmlu.py,sha256=Kr6WVv4Z2SH3VJU549LF1yUXIPf1wvL60t7WdxfOcY8,24029
132
+ eval_framework/tasks/benchmarks/openbookqa.py,sha256=k-wqRliIdzNsCoExlO9NX-cL6Fzoi5do2A5kBwcAQRY,3663
133
+ eval_framework/tasks/benchmarks/opengptx_eu20.py,sha256=w2ITsVeyLKakZr_VCMaLckvqqSL3MXu9c8ZcywJjV6E,14923
134
+ eval_framework/tasks/benchmarks/pawsx.py,sha256=VnTTi396NQzBiUfUxsJ14WRvDibYEDL4S7cHDjoQ96Y,3106
135
+ eval_framework/tasks/benchmarks/piqa.py,sha256=_dyCkVzXrIm1t_eTjPi2xqhBtC9xaNnsLwui4JEgnIk,2517
136
+ eval_framework/tasks/benchmarks/quality.py,sha256=8GTmOAsX3cxgNDYaTT31cN8N-xSr6iVIR0pvhD8aqeo,1955
137
+ eval_framework/tasks/benchmarks/sciq.py,sha256=CTZuKsyr2CuK-tbz2XnNrQSdttb1_QwXqkBLoEcsRS8,4271
138
+ eval_framework/tasks/benchmarks/sphyr.py,sha256=CLp3eLkKly60IkYKyV_-nos3bUKkBhQLpakct2qiEOg,3199
139
+ eval_framework/tasks/benchmarks/squad.py,sha256=zrd-PPEMLx12SEyw0-qLe5YKC424fZAf7AWnX9AjTow,8201
140
+ eval_framework/tasks/benchmarks/struct_eval.py,sha256=9NnDdkMWAitDfT9ksrb-F2_GCMZU326xU1Hevwk0ysY,4050
141
+ eval_framework/tasks/benchmarks/tablebench.py,sha256=MxQlW7d62hBqm0HZ93XxzVHVn_JIeqK4UJO4-b5tO3U,4912
142
+ eval_framework/tasks/benchmarks/triviaqa.py,sha256=S1RTJrJaeowmgjtMkcDxjUbu_9y35Eo1x9HouOqA_M0,1646
143
+ eval_framework/tasks/benchmarks/truthfulqa.py,sha256=JO2-oCuf7DjxOkpESvEeWzAUh7oMhicOaIGLTe017D4,4915
144
+ eval_framework/tasks/benchmarks/winogender.py,sha256=18OLgtKZIsnN2P1YOPzc8LrjnsRy4zXSGXdKRNn_hxQ,2609
145
+ eval_framework/tasks/benchmarks/winogrande.py,sha256=QDTBHt4XtNQ1QF_Y7wScI9_pK_bY_b6O9dQ9VrVTqC4,2705
146
+ eval_framework/tasks/benchmarks/winox.py,sha256=Qvbb_HLYpQlEKeOkJ22nFGAWlYjPQhMEp-Ra95_6k0U,1996
147
+ eval_framework/tasks/benchmarks/wmt.py,sha256=I54AmQNX4uv_7k5U3nYdQi2DhQIuAkOzRZCbUZYyZ64,5690
148
+ eval_framework/tasks/benchmarks/zero_scrolls.py,sha256=qUgoDICnmEXM1EZdV2PTUXf4YzUjyOCwmgWACN_-0zE,7862
149
+ eval_framework/tasks/eval_config.py,sha256=xfFhzdfCHOMx1v2vl8Lp4XXC2SxN34PlbDzqo9nrGc8,5427
150
+ eval_framework/tasks/perturbation.py,sha256=ZtXMqPk9YSWiX6ytgUXEeacA1LhAgvkJyxuO7MruJho,3532
151
+ eval_framework/tasks/registry.py,sha256=d4uYpg8JOfStl-r0mExaJOYL4rqsXD9RAQ93fi32D7I,5738
152
+ eval_framework/tasks/task_loader.py,sha256=uXurAyS35y90cMUW_Sc2bFZBXuN34FiACzUniLRxjw4,3784
153
+ eval_framework/tasks/task_names.py,sha256=VH5eIrHdDps8zXMlKEwiGFuEgHpeZY9sx7txJq0H-qw,16463
154
+ eval_framework/tasks/utils.py,sha256=jjo5JDDTzlTevx5angDp9terg-eW6z8g1ZmAuFXdslw,20006
155
+ eval_framework/utils/constants.py,sha256=LEElGdYrkIWm8dJa7lfD5LbL-fwkF17Z0nQ7_XVg098,164
156
+ eval_framework/utils/file_ops.py,sha256=tGLHnwnZsm3gc8D6kGFzONIqR5wtdKDEzDKziRYoyo8,10922
157
+ eval_framework/utils/generate_task_docs.py,sha256=aPTz8M-Dlh06z4Ce41OIo81KnCpYnHIuusZQtpid2I4,9780
158
+ eval_framework/utils/helpers.py,sha256=GZJNUWaKg-6LcSU4gm585fX0kKd9Y_gvlAevYaNitSg,1253
159
+ eval_framework/utils/logging.py,sha256=xqwop0qpSRG8KTvzY31hX6Ew0ly_LqtZ16RItHpPmFE,1945
160
+ eval_framework/utils/packaging.py,sha256=Z_eXjzcgCvifJwJ-pqeAtNPVjscgR6QYNKe4E-iSnFc,1889
161
+ eval_framework/utils/tqdm_handler.py,sha256=-FHPrX29u5dWhbzworXIJ_I1EdfeoWZsWbwmRXz3Fuk,298
162
+ template_formatting/README.md,sha256=gVrps3xXKPP87lWmKzaaEjNMBikQVj_MMZ-FdG3O6Xg,3787
163
+ template_formatting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
+ template_formatting/formatter.py,sha256=J6qGovNQYo5Cc_R7XV8iCO0d-UNaCdO-1N_FyD4mhpw,21522
165
+ template_formatting/mistral_formatter.py,sha256=NJESWDsd_QFx8USoTGJ1QOYczwXf-ObjnpTy7mqzjK0,6734
166
+ template_formatting/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
167
+ eval_framework-0.2.7.dist-info/WHEEL,sha256=KSLUh82mDPEPk0Bx0ScXlWL64bc8KmzIPNcpQZFV-6E,79
168
+ eval_framework-0.2.7.dist-info/entry_points.txt,sha256=k4dpbNwZ5XnovyqrScWTZ-UYzf_EPYOvZA2QTkqrYlk,59
169
+ eval_framework-0.2.7.dist-info/METADATA,sha256=Ye0vaxk9Xv_aO41sRyognvdKde7eu2Ef-nXAypQljsA,29424
170
+ eval_framework-0.2.7.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.22
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ eval_framework = eval_framework.run:run
3
+
@@ -0,0 +1,83 @@
1
+
2
+ # Internal template formatting package
3
+
4
+ Single source of truth for internal template formatting. Ensures compatibility between `scaling-internal` and `eval-framework`.
5
+
6
+ ### Install uv
7
+
8
+ `uv` is used for dependency management and packaging in Python projects. To install uv, follow the [official instructions](https://docs.astral.sh/uv/getting-started/installation/).
9
+
10
+
11
+ ## Project Structure
12
+ - src/: Contains the template formatting code.
13
+ - tests/: Contains pytest test cases.
14
+ - test_formatter_eval.py: Basic unit tests for the template formatter derived from `eval-framework`
15
+ - test_formatter_scaling.py: Basic unit tests for the template formatter derived from `scaling-internal`
16
+ - pyproject.toml: Configuration file for uv and other tools like MyPy, ruff and pytest.
17
+
18
+
19
+ ## Adding dependencies
20
+
21
+ - **Adding Production Dependencies**: These are dependencies necessary for your project to run. For example, if your project uses Pydantic for data validation, you would add it as a production dependency:
22
+
23
+ ```bash
24
+ uv add pydantic
25
+ ```
26
+ - **Adding Development Dependencies**: These are dependencies that are only needed during development, such as testing libraries or linters. For instance, to add pytest for writing and running tests, you would specify it as a development dependency:
27
+
28
+ ```bash
29
+ uv add --group dev pytest
30
+ ```
31
+
32
+ After adding any new dependencies, you need to install them to update your project's virtual environment:
33
+ ```bash
34
+ uv sync
35
+ ```
36
+ This command ensures that all dependencies listed in your pyproject.toml file are correctly installed and available for use in your project.
37
+
38
+ To install all dependencies (including optional ones), run
39
+ ```bash
40
+ uv sync --all-extras
41
+ ```
42
+
43
+ ## Usage
44
+ **Running Commands with uv**
45
+
46
+ `uv` creates a virtual environment for your project, which isolates your dependencies from the global Python environment. This isolation helps prevent version conflicts and ensures reproducibility. Here's how to use `uv` to run commands:
47
+
48
+ - **Installation**: To set up pre-commit hooks, you first need to install the pre-commit package and then install the hooks.
49
+
50
+ You can either follow the [install instructions](https://pre-commit.com/#install) or install it globally through `uv tool install pre-commit`
51
+
52
+ ```bash
53
+ pre-commit install
54
+ ```
55
+
56
+ - **Running Hooks Manually**: Although pre-commit hooks are triggered automatically before each commit, you can also run them manually to check your files at any time:
57
+
58
+ ```bash
59
+ pre-commit run -a
60
+ ```
61
+ This command runs all hooks against all files, which is useful for initial setup or periodic checks.
62
+
63
+ - **Current Hooks**:
64
+ - **Check JSON**: Ensures JSON files are valid.
65
+ - **Pretty format JSON**: Formats JSON files to be more readable.
66
+ - **Fix End of Files**: Ensures files end with a newline.
67
+ - **Trim Trailing Whitespace**: Removes unnecessary trailing whitespace.
68
+ - **Ruff**: Runs the Ruff linter to check Python code for stylistic and programming errors.
69
+ - **Ruff-format**: Automatically formats Python code using Ruff.
70
+
71
+
72
+ - **Static Type Checking with MyPy**: To ensure your code is type-safe, run MyPy to check for type errors. This should be done frequently during development to catch type-related issues early:
73
+ ```bash
74
+ uv run --all-extras mypy ./src
75
+ uv run --all-extras mypy ./tests
76
+ ```
77
+ Run these commands after making changes to your source or test files to verify that your changes haven't introduced type errors.
78
+
79
+ - **Running Tests with pytest**: To ensure your code works as expected and hasn't broken existing functionality, run your tests:
80
+ ```bash
81
+ uv run --all-extras pytest
82
+ ```
83
+ Run this command frequently during development, especially before committing changes, to ensure all tests pass.
File without changes