eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
eval_framework/__init__.py,sha256=dLv--h62kDYK2uN5aFpEowXpW2P9XLwMud-NwoiW_u4,120
|
|
2
|
+
eval_framework/base_config.py,sha256=LJOHr0MtE9PPsfbLmP2tpoa52Tt0rIHMaW3CTYVwehs,1236
|
|
3
|
+
eval_framework/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
eval_framework/context/determined.py,sha256=A3o-N7149LF6OqG9fDvmmge6tXi65qGexLuhazi1R30,7070
|
|
5
|
+
eval_framework/context/eval.py,sha256=uFlWBMEZ-xlJFjnvjaxvCphXlgWYXb2F-6xzVuoqXJk,4067
|
|
6
|
+
eval_framework/context/local.py,sha256=rO8yb1lMHdqiKcdFRVktWR6Ux3T9tM6Xa946Fpwxi5k,1962
|
|
7
|
+
eval_framework/evaluation_generator.py,sha256=xlZeGk0Y6jNYLQ3-8qAeCpx-tPBu924mvWLtXPqbGCA,11611
|
|
8
|
+
eval_framework/exceptions.py,sha256=j4jjN2Y-8vMxf0Dfms1buAJHNMzEQ6kZca6l_z-lDBo,38
|
|
9
|
+
eval_framework/external/ifeval_impl/README.md,sha256=fC2t3BSbjW_Hl8iAUoTwiFpblgY1NeqeF67tl5ScWT4,408
|
|
10
|
+
eval_framework/external/ifeval_impl/instructions.py,sha256=fp94wBZv0SQgm7OTTrguh1yiscPoYst8MqoBmoO_A6k,55615
|
|
11
|
+
eval_framework/external/ifeval_impl/instructions_registry.py,sha256=TzNBdO5rHl3jPwvm-o83IpJ8l1o0DoG2jp7gDSd54RU,6722
|
|
12
|
+
eval_framework/external/ifeval_impl/instructions_util.py,sha256=qUb8wipLfBMvHv3UpMTn-yZay_2JU7X_524f141xHJs,26095
|
|
13
|
+
eval_framework/external/ifeval_impl/utils.py,sha256=i2ADNqLmcBlBAdL7BZMa4HoTXJ3DU01UL01-7grcebg,4537
|
|
14
|
+
eval_framework/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
eval_framework/llm/aleph_alpha.py,sha256=2SforEMRAORdeq9BqCho2KURLv8FYjabGOH4fghEW9M,14542
|
|
16
|
+
eval_framework/llm/base.py,sha256=WIA4N4z1RH9my-qiozRLyS6PmMKyd9_Uz98aGnoGe_U,2488
|
|
17
|
+
eval_framework/llm/huggingface.py,sha256=YpqFFAYrBO-G6Aws2AxzNNGsHtEo8CYDWZ0Gddfn95E,15042
|
|
18
|
+
eval_framework/llm/mistral.py,sha256=Fkxop0tSegNo22DCdyAWXUV8mKK30Fbq_aM9iXs-HHA,2732
|
|
19
|
+
eval_framework/llm/models.py,sha256=W0W4vaosvkU2CrHAT-4TxWQwGPuPUaDqIPtzb4G2uDA,638
|
|
20
|
+
eval_framework/llm/openai.py,sha256=x_OlwGUGPh5A6wp7HZBA-mn-4u6BbfIhLPguXQUjqIE,8633
|
|
21
|
+
eval_framework/llm/vllm.py,sha256=T5gaDYglrN6Omv3MpZZ7n5emjxfZPycKHHtjlqvfx5A,17086
|
|
22
|
+
eval_framework/logger.py,sha256=8Bj7S8JRYh-SJZ3dEgueDIoVrhOjRyDsnRuLG61ft9E,61
|
|
23
|
+
eval_framework/main.py,sha256=y-5bpkvjwVhcYWmzHXOPDv9bPABkgD_9GXID8pJI2F0,7552
|
|
24
|
+
eval_framework/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
eval_framework/metrics/base.py,sha256=3VmIfC8AN-SXFf_7d_5fnwTQirDKBtJI5JxwDGOhtDU,1013
|
|
26
|
+
eval_framework/metrics/completion/__init__.py,sha256=3m1ekU7MH8JqV-6VHRBRQiPatqpZsNW6tQKpaXjpztE,52
|
|
27
|
+
eval_framework/metrics/completion/accuracy_completion.py,sha256=t-6lJBZ6dhhppepIkLEmB3TSd9qkGE3mrPYaDhnql98,697
|
|
28
|
+
eval_framework/metrics/completion/bleu.py,sha256=IDO3Hn-VgH7eT83iO9FCBI8gBUcj8cMOc1kfm_E73uI,3311
|
|
29
|
+
eval_framework/metrics/completion/chrf.py,sha256=o0zbwOpbL99fg00neET1Pb7jsfT8Sd1n-px_Jql43X8,2526
|
|
30
|
+
eval_framework/metrics/completion/code_assertion.py,sha256=f9XYPJzP6XWD2wqZ1_qWYyw56IhBnxp7hBmymw30ExA,1489
|
|
31
|
+
eval_framework/metrics/completion/code_execution_pass_at_one.py,sha256=1GyruuwS12UXfLSo3K7rRy2CZhC3W0oOAilFzAUtOeU,5123
|
|
32
|
+
eval_framework/metrics/completion/comet.py,sha256=M_4ITNfthjxqX8CgVKlxK5W7Gdu08FbXsmbGOx4SfSA,2333
|
|
33
|
+
eval_framework/metrics/completion/concordance_index.py,sha256=LfmM4KmXKiPbztoJaBRCDMA6lQdPFhHcRTYjNP0olQk,1369
|
|
34
|
+
eval_framework/metrics/completion/csv_format.py,sha256=sxo8xnEkGUw7FnkkZC2k58yn3GPuJQ_rJAFNLLo2sNE,3640
|
|
35
|
+
eval_framework/metrics/completion/cwe_accuracy.py,sha256=1LV35uxoDlKzLE_XWBItMMVsGBLqXP2DfqiI0L2T-dI,2130
|
|
36
|
+
eval_framework/metrics/completion/exponential_similarity.py,sha256=93rQV_pG7RbFMt0DWCDQe8iUiF9GzcTneHRxvH9tIgI,2702
|
|
37
|
+
eval_framework/metrics/completion/f1.py,sha256=ddHQXsQv5keZDrJvoY_nPPZtqZMEfrRrafeSWg6HQys,1512
|
|
38
|
+
eval_framework/metrics/completion/format_checker.py,sha256=JUgx3EbxsZEJr0bNlmQFQdQzkghvegq8QtC4vxQjvaI,1997
|
|
39
|
+
eval_framework/metrics/completion/grid_difference.py,sha256=sun639fzMNkhjoesfgRIsy7dofF5vxzbKlvVvUfA_y4,3104
|
|
40
|
+
eval_framework/metrics/completion/ifeval.py,sha256=93KxO8qfE6-9snppzpr3a7jCmCT2ciJOqWcK31VB2No,2578
|
|
41
|
+
eval_framework/metrics/completion/json_format.py,sha256=MJz8tFASxEsqPwCzhz66Z3m2y4eHPYchRMkk0n4kD6I,6209
|
|
42
|
+
eval_framework/metrics/completion/language_checker.py,sha256=QO9yhHe99ZkvZxLSZ5m5B8N_oRVNsZeklg0b5MfUadg,3323
|
|
43
|
+
eval_framework/metrics/completion/length_control.py,sha256=15_S5m7SNFNR5KXNhmvTy3pGhtsuawlRU76w-ehLix8,3294
|
|
44
|
+
eval_framework/metrics/completion/math_reasoning_completion.py,sha256=L5GH_aQI6Azngv9a2DMueraNFBPmT3Ges09CO4naTXM,12050
|
|
45
|
+
eval_framework/metrics/completion/niah_accuracy.py,sha256=ycFUVXpJqdA_-aBvmzKUfaSpPi_-nCDY4F27kQjsPks,5803
|
|
46
|
+
eval_framework/metrics/completion/placeholder_checker.py,sha256=PhpPlcrP_QDYCOJuWK12ZfcUAOYys9IxZOKICTNUa1U,1147
|
|
47
|
+
eval_framework/metrics/completion/repetition.py,sha256=MRsap8ZDISDfC5luqWlQA05W_anjFU6XzzvD55LsM_M,3340
|
|
48
|
+
eval_framework/metrics/completion/rouge_1.py,sha256=Y1m7e9q258cIFjIfGShssneFn08_85ZQF6-YqIgOORQ,1514
|
|
49
|
+
eval_framework/metrics/completion/rouge_2.py,sha256=3GKFHVXHKvPOjk4SaU6D1vbykK5WeE6Q2Ogjhasa1uk,1978
|
|
50
|
+
eval_framework/metrics/completion/rouge_geometric_mean.py,sha256=0fqiWx72eJscuLkekh901CwhFInN9HoxQ2LJod40fJs,1730
|
|
51
|
+
eval_framework/metrics/completion/rouge_l.py,sha256=SwM1s7MQWKjVPlS0KyHcEH9pzkA-hlidz-4gM9kiTu4,2360
|
|
52
|
+
eval_framework/metrics/completion/struct_eval_metrics.py,sha256=8wBx7yTfzjww1wPST57X9sjrVNHavtKXZcOiCkbNrZk,8148
|
|
53
|
+
eval_framework/metrics/completion/ter.py,sha256=mskQejjl1RX0WuSQk1e42-L1QfH0kwTVIhDwqbaBNEc,2614
|
|
54
|
+
eval_framework/metrics/completion/text_counter.py,sha256=UXBOt7okRZHx6BuVcyAS9IeNoYSnryLKkdgYn0FArF8,7100
|
|
55
|
+
eval_framework/metrics/efficiency/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
|
+
eval_framework/metrics/efficiency/bytes_per_sequence_position.py,sha256=fPNqu_fQSqy__1Es5Zbm0niBr8N6j-jnprY-ysAFrds,1849
|
|
57
|
+
eval_framework/metrics/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
+
eval_framework/metrics/llm/base.py,sha256=uCOGxMM7d3oS2ECZTg-Xy3GOQow68TOsB8YBKXGRnTI,286
|
|
59
|
+
eval_framework/metrics/llm/graders/chatbot_style_grader.py,sha256=7tplUGC7G_F730t9Ij242dBRbQKUaCzURP1iX6ZKgrI,4114
|
|
60
|
+
eval_framework/metrics/llm/graders/comparison_grader.py,sha256=9fIWPqVeky5MrwvWHrQNzFeeDlu6LH8bw5-eBHdc82g,5363
|
|
61
|
+
eval_framework/metrics/llm/graders/conciseness_grader.py,sha256=-WE7dOo7Jo57UzmesAr61WKurB9NegNBVtPLmViLOZw,3562
|
|
62
|
+
eval_framework/metrics/llm/graders/contains_names_grader.py,sha256=5NUGVcAzkyGJ1or5uReCbUJT3psplnHTd7dUkf_iR0Y,2724
|
|
63
|
+
eval_framework/metrics/llm/graders/format_correctness_grader.py,sha256=1ewPCXj97favA3BovNSOpHRILhtsTbmp5vWJfzk-968,4549
|
|
64
|
+
eval_framework/metrics/llm/graders/instruction_grader.py,sha256=v9ew30JHpO8LK99D2FYhFz6E-ikE4PIld3sCT79u0gk,11625
|
|
65
|
+
eval_framework/metrics/llm/graders/language.py,sha256=9YlEE3BjvzfHfQtRMTWrP_NxGbjKbZRbAjqo3GvL_wE,1720
|
|
66
|
+
eval_framework/metrics/llm/graders/long_context_grader.py,sha256=BX29D8BsVoVGOfGlQjAfFMJFw2Nn77puwMOBnHJvJoE,2476
|
|
67
|
+
eval_framework/metrics/llm/graders/models.py,sha256=PVGzyjOcmm-DN-NpoO8SzFyUNVoDLG330f3uFXG0SfE,2206
|
|
68
|
+
eval_framework/metrics/llm/graders/refusal_grader.py,sha256=SUFUiveL36LWyKR5w8LUgYl2Kx4aAc5IPu5uV8j4N5k,2272
|
|
69
|
+
eval_framework/metrics/llm/graders/sql_quality_grader.py,sha256=ooNCxBNKeyqFxf2nAKdtUcd7aIMQpmxcEn9iTo5XhiQ,5624
|
|
70
|
+
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py,sha256=lZJzXyMNYLhY4RmrPZsBxJByPXWMk8aeDjGxUArwv4U,4599
|
|
71
|
+
eval_framework/metrics/llm/llm_judge_chatbot_style.py,sha256=pb_GWN5xVHRuk64XPNkIqRV1htKaNmr-Cdjx9jxgGiw,1400
|
|
72
|
+
eval_framework/metrics/llm/llm_judge_completion_accuracy.py,sha256=KS1Fn3cZAyIfXd8LM_o2s9IjoHpftqtuSIJ3fGDAr6Y,1523
|
|
73
|
+
eval_framework/metrics/llm/llm_judge_conciseness.py,sha256=v2iSxBeUU3QTjdy0hx-9t5j0pf4LMnp5z2JCiqpN9_8,1439
|
|
74
|
+
eval_framework/metrics/llm/llm_judge_contains_names.py,sha256=7r-sAI6Qwej4fgQIhmotXtEK5ZaLcHxgyjbP7TYzRtE,1401
|
|
75
|
+
eval_framework/metrics/llm/llm_judge_format_correctness.py,sha256=AwHLblRtWSo7hg0sJpcdQAZP7ldrfZFDp2rGB9-6rns,1668
|
|
76
|
+
eval_framework/metrics/llm/llm_judge_instruction.py,sha256=PcXACNijZSYIfLoks-bqCgjqo0YPqQpX4O5GinC2SvE,2170
|
|
77
|
+
eval_framework/metrics/llm/llm_judge_mtbench_pair.py,sha256=YCZcXA-HxQww7HUCgzNapJIUfPW3I0YP6WGG-dtRD9w,24787
|
|
78
|
+
eval_framework/metrics/llm/llm_judge_mtbench_single.py,sha256=20iNNtGm1Ch8Upt0Vk7MGlKvwYyN4i0lvRDym3AnK0w,17362
|
|
79
|
+
eval_framework/metrics/llm/llm_judge_refusal.py,sha256=iAoOstgOvKtk9M9wqVqrf21mM0Xbss4EraO7R3g9FBQ,1418
|
|
80
|
+
eval_framework/metrics/llm/llm_judge_sql.py,sha256=qMj2pHzijq2lVHqToewQL_xJSgKLulZWSb64996ztnQ,14480
|
|
81
|
+
eval_framework/metrics/llm/llm_judge_world_knowledge.py,sha256=C48aHS6bcVtGMk0YxzqDAGiHekypyeo--SK7EFVN5Jc,1517
|
|
82
|
+
eval_framework/metrics/loglikelihood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
|
+
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py,sha256=l0OxJFSQiLnwJdfX72SH9k-krZr8AI1FOUYlHfiT2Q0,1921
|
|
84
|
+
eval_framework/metrics/loglikelihood/probability_mass.py,sha256=I3AhKlwSYQEnKFfagopqmc6-Mdnui43GR0LSFfsrJVk,2291
|
|
85
|
+
eval_framework/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
86
|
+
eval_framework/response_generator.py,sha256=-6qQ2U_he0uNKG5kcir5tiZcQ4cOybTiQ1KWTmLe3cI,19244
|
|
87
|
+
eval_framework/result_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
88
|
+
eval_framework/result_processors/base.py,sha256=wuoOQENw6GPDsRxwk5lxeNUprplNa33D1XuQ4nJdJI0,2017
|
|
89
|
+
eval_framework/result_processors/hf_processor.py,sha256=p-b9EBKx0ahYK8tu-h8l76rDlYtcZzx_zKIe2p8HDBI,3071
|
|
90
|
+
eval_framework/result_processors/result_processor.py,sha256=zMQ_SJHbr81og4I_6Q1OrQGSJWCxlES_3xklMmU1S0Q,5362
|
|
91
|
+
eval_framework/run.py,sha256=lWn_u5Sfp4iZVtubArg2VfpD-qIRhuIBwp-lz-2q8o8,9896
|
|
92
|
+
eval_framework/run_direct.py,sha256=KMWkLDuDt-HPlmjsSGKAiXd7LlrpVUPKv89Gk3i0snA,1176
|
|
93
|
+
eval_framework/shared/types.py,sha256=lPA5uhdRgs3H---SFsjUOYwUkqBYL0K2Y2JvxCOyMLc,8841
|
|
94
|
+
eval_framework/tasks/__init__.py,sha256=Fzs8DY53Dt0Gsu34Ro6Dk6by9qgaFF0UIIHERl6PO5g,120
|
|
95
|
+
eval_framework/tasks/base.py,sha256=ujLYsjgRtOpEccx8RligP2HJd8G-A-ct1Tr8qzxJTMM,12558
|
|
96
|
+
eval_framework/tasks/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
97
|
+
eval_framework/tasks/benchmarks/arc.py,sha256=KAu4etBPiLde4sLR706H536msLe-cc2ItE8eYCtV5ro,1834
|
|
98
|
+
eval_framework/tasks/benchmarks/arc_de.py,sha256=Ng7n0HeAEPh2SEHOTwIl1ccVCWH9iI0-U11mBe0aR38,1871
|
|
99
|
+
eval_framework/tasks/benchmarks/arc_fi.py,sha256=fgjdHN0pq8V_R_vMsLsk9Q2Mf7JwKF0vVFazESHtf2o,1858
|
|
100
|
+
eval_framework/tasks/benchmarks/belebele.py,sha256=x3V5DsNdkJpwMCWi5SOVIMJS7-ZZhi5E3XttGa23bR8,2213
|
|
101
|
+
eval_framework/tasks/benchmarks/bigcodebench.py,sha256=vrEySwqQTAEFnWGDoTzAOL6IVSwulbK7Rp60hxyOlPc,5892
|
|
102
|
+
eval_framework/tasks/benchmarks/casehold.py,sha256=hFWW1LnzVHk8un58flGLh_JOv3h95XHwOB-WenIWOJc,1727
|
|
103
|
+
eval_framework/tasks/benchmarks/chembench.py,sha256=GaPW0oBYLx4cQaZuvcFdDIL5XG5YUurqRcdaWXSzVgo,3522
|
|
104
|
+
eval_framework/tasks/benchmarks/copa.py,sha256=yJDy93Kjv1mVfwUORZidsAdPVrRjr6HKP8LREuOlQyU,1483
|
|
105
|
+
eval_framework/tasks/benchmarks/duc.py,sha256=6VAk38UrbuG63gr-K69WZ016g-EH0ONxTzictJ-AcN0,3516
|
|
106
|
+
eval_framework/tasks/benchmarks/flores200.py,sha256=HzgSaARfqTJaSQo71GUBJucY9ZZg6m0d4T3XNz9QARo,2176
|
|
107
|
+
eval_framework/tasks/benchmarks/flores_plus.py,sha256=bTKH8ECFdZRw-3RV-37a2clpY7u1Y2QigVeXIQoI2c0,3346
|
|
108
|
+
eval_framework/tasks/benchmarks/gpqa.py,sha256=bOg1oV1OXLsLHdYR8rlabJi2o4FA8szOrbiyRIsNGQs,7873
|
|
109
|
+
eval_framework/tasks/benchmarks/gsm8k.py,sha256=-JKQIMOZP0tF-GpZzGZNmqoAfMtCoDFAyagASln2Vbc,5790
|
|
110
|
+
eval_framework/tasks/benchmarks/hellaswag.py,sha256=j5KzRNSdIC_oBMrDCIOTsWoDBpfksodECfbneWKL2kU,1691
|
|
111
|
+
eval_framework/tasks/benchmarks/hellaswag_de.py,sha256=09sItfKknm6Xm-NKm5HcBgm-EYlm0dBqdgkEcXCReVk,2091
|
|
112
|
+
eval_framework/tasks/benchmarks/humaneval.py,sha256=wDP8ymSaqrhe28pTXavt0fxayA-cdUU8eOp5V8Q6T40,3370
|
|
113
|
+
eval_framework/tasks/benchmarks/ifeval.py,sha256=sww3y21udT1xCdf1fmh7z4EZ6-XLMR5fFgqUdwCUmZY,2826
|
|
114
|
+
eval_framework/tasks/benchmarks/include.py,sha256=Io4IFYTOCEoolVMRvjMEc58YJSJh4FcNnJ7wCYOmeIo,3380
|
|
115
|
+
eval_framework/tasks/benchmarks/infinitebench.py,sha256=bDNkNNe2v1FNOwGR9fHbTaUXFJJxNlHR6emarFjPFE4,11024
|
|
116
|
+
eval_framework/tasks/benchmarks/math_reasoning.py,sha256=cFs2x67imAGxdiPbv4YrgmkotxXtUdkr1v0PaDvLElg,22127
|
|
117
|
+
eval_framework/tasks/benchmarks/mbpp.py,sha256=hX8NnmI8iV4L35BX1-OCESNtfQq6hPryVKXJ_rsYQCI,7530
|
|
118
|
+
eval_framework/tasks/benchmarks/mmlu.py,sha256=pSIdhMABfKGY210M-XWZ7yZCRv-DULJM6-fR2iZDCHY,6657
|
|
119
|
+
eval_framework/tasks/benchmarks/mmlu_de.py,sha256=MAmiVWR-tSQFT383lKz_z0b0pdhWQ7PjPwtUuIGCb64,4634
|
|
120
|
+
eval_framework/tasks/benchmarks/mmlu_pro.py,sha256=zlUNC1M0XmydOccv_2pr5RYoFEmUdJKfHdn7Pt3bs4Q,5288
|
|
121
|
+
eval_framework/tasks/benchmarks/mmmlu.py,sha256=Kr6WVv4Z2SH3VJU549LF1yUXIPf1wvL60t7WdxfOcY8,24029
|
|
122
|
+
eval_framework/tasks/benchmarks/openbookqa.py,sha256=8OaC9hECRg9G0CBNhpx0OBThwD4OgjodD2xkvSshBPo,1524
|
|
123
|
+
eval_framework/tasks/benchmarks/opengptx_eu20.py,sha256=cg24VDABCug5pO2NOeLRNh2L5c536IJw_WNob4Zr1K0,14890
|
|
124
|
+
eval_framework/tasks/benchmarks/pawsx.py,sha256=VnTTi396NQzBiUfUxsJ14WRvDibYEDL4S7cHDjoQ96Y,3106
|
|
125
|
+
eval_framework/tasks/benchmarks/piqa.py,sha256=vnt6OHSFit7IHGfhz2qlNJWea4TmWXgIZulyU7lWNqY,1465
|
|
126
|
+
eval_framework/tasks/benchmarks/quality.py,sha256=8GTmOAsX3cxgNDYaTT31cN8N-xSr6iVIR0pvhD8aqeo,1955
|
|
127
|
+
eval_framework/tasks/benchmarks/sciq.py,sha256=oNjEJJ1dy-uPtq7uSojOAj1znR71fnI4Fa2awZJ8hTk,1593
|
|
128
|
+
eval_framework/tasks/benchmarks/sphyr.py,sha256=68miYCDlJHBTU7vTXXnPUQ54DJy_QwLLq-pjiAaXbJw,3004
|
|
129
|
+
eval_framework/tasks/benchmarks/squad.py,sha256=LqF4NTC11HazddgeLFsosd2b2KYJnQ-SWL6hHKuZDpE,3247
|
|
130
|
+
eval_framework/tasks/benchmarks/struct_eval.py,sha256=vz6b26q_uz1Yyk7mmAeCo3UP0oA-Ih5-PG6S-6ojYeg,3850
|
|
131
|
+
eval_framework/tasks/benchmarks/tablebench.py,sha256=MxQlW7d62hBqm0HZ93XxzVHVn_JIeqK4UJO4-b5tO3U,4912
|
|
132
|
+
eval_framework/tasks/benchmarks/triviaqa.py,sha256=S1RTJrJaeowmgjtMkcDxjUbu_9y35Eo1x9HouOqA_M0,1646
|
|
133
|
+
eval_framework/tasks/benchmarks/truthfulqa.py,sha256=cXS8z8udxHqxB0YRo3ZTuqJiqVcTa8DIGvhigu4MJMY,3845
|
|
134
|
+
eval_framework/tasks/benchmarks/winogender.py,sha256=ejv27r22F-hoU_kRvhe7YyZgr4ZjlVHzAnvEHYCFS0s,1541
|
|
135
|
+
eval_framework/tasks/benchmarks/winogrande.py,sha256=t9irSc8FqD2fbFredvvf-ACEXc1-QlU-4x39KXP-YXs,1634
|
|
136
|
+
eval_framework/tasks/benchmarks/winox.py,sha256=Qvbb_HLYpQlEKeOkJ22nFGAWlYjPQhMEp-Ra95_6k0U,1996
|
|
137
|
+
eval_framework/tasks/benchmarks/wmt.py,sha256=I54AmQNX4uv_7k5U3nYdQi2DhQIuAkOzRZCbUZYyZ64,5690
|
|
138
|
+
eval_framework/tasks/benchmarks/zero_scrolls.py,sha256=qUgoDICnmEXM1EZdV2PTUXf4YzUjyOCwmgWACN_-0zE,7862
|
|
139
|
+
eval_framework/tasks/eval_config.py,sha256=d49KUjBqIiH2sBGaIs0sylR2WQKsR5sDnbFXL_P3n1Q,4311
|
|
140
|
+
eval_framework/tasks/perturbation.py,sha256=ZtXMqPk9YSWiX6ytgUXEeacA1LhAgvkJyxuO7MruJho,3532
|
|
141
|
+
eval_framework/tasks/registry.py,sha256=d4uYpg8JOfStl-r0mExaJOYL4rqsXD9RAQ93fi32D7I,5738
|
|
142
|
+
eval_framework/tasks/task_loader.py,sha256=js6Um6ZoFFVl1FHfrbH4EQnFEX8-_jgdDAHvrPNULDw,3669
|
|
143
|
+
eval_framework/tasks/task_names.py,sha256=JbOHvhkii11-8nXmAqaqN_AuhRJMvMv514KmSfSI1Mw,8954
|
|
144
|
+
eval_framework/tasks/utils.py,sha256=QdTXOeWkUbC0hCcZbAPFRWFCu1KkP-9kFukIoyaSJkc,19785
|
|
145
|
+
eval_framework/utils/constants.py,sha256=LEElGdYrkIWm8dJa7lfD5LbL-fwkF17Z0nQ7_XVg098,164
|
|
146
|
+
eval_framework/utils/generate_task_docs.py,sha256=KBLicduL27fwa0bWwxPw_tfDcChn2E0GYpUt3dB9ldo,8999
|
|
147
|
+
eval_framework/utils/helpers.py,sha256=KCVUcGw-Hvkf6Qs7h4fqRxELORIlPXTEMDV5zO_2IYU,160
|
|
148
|
+
eval_framework/utils/logging.py,sha256=mpGNSoam2N3YkTClKF2hoe4XftsOZFa4bE9HHU6EXV0,1648
|
|
149
|
+
eval_framework/utils/packaging.py,sha256=Z_eXjzcgCvifJwJ-pqeAtNPVjscgR6QYNKe4E-iSnFc,1889
|
|
150
|
+
template_formatting/README.md,sha256=gVrps3xXKPP87lWmKzaaEjNMBikQVj_MMZ-FdG3O6Xg,3787
|
|
151
|
+
template_formatting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
152
|
+
template_formatting/formatter.py,sha256=2U39HhQVWEy__5WysrapUR_GBjogcgmqnp47IKJf57Y,21490
|
|
153
|
+
template_formatting/mistral_formatter.py,sha256=NJESWDsd_QFx8USoTGJ1QOYczwXf-ObjnpTy7mqzjK0,6734
|
|
154
|
+
template_formatting/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
|
+
template_formatting/tests/test_formatter_eval.py,sha256=Z7YLmPLzOqJP7TbtdDDAWR_WBhlnCKBRNnm2NXT8jmk,17186
|
|
156
|
+
template_formatting/tests/test_formatter_scaling.py,sha256=jMdul-2urA6ouVY6d5iKdGhiaLMEp5cDHr5tVIJaVrU,9405
|
|
157
|
+
template_formatting/tests/test_mistral_formatter.py,sha256=V95tEjxoEzH3Eai7aHaqDkOou0nPkc9v9mx9yV48PqQ,6068
|
|
158
|
+
eval_framework-0.2.0.dist-info/WHEEL,sha256=Jb20R3Ili4n9P1fcwuLup21eQ5r9WXhs4_qy7VTrgPI,79
|
|
159
|
+
eval_framework-0.2.0.dist-info/entry_points.txt,sha256=k4dpbNwZ5XnovyqrScWTZ-UYzf_EPYOvZA2QTkqrYlk,59
|
|
160
|
+
eval_framework-0.2.0.dist-info/METADATA,sha256=ykdyXfaEI1hexHDSKwvUICz15Bzz-pfeuypJtYW-mFU,25935
|
|
161
|
+
eval_framework-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
|
|
2
|
+
# Internal template formatting package
|
|
3
|
+
|
|
4
|
+
Single source of truth for internal template formatting. Ensures compatibility between `scaling-internal` and `eval-framework`.
|
|
5
|
+
|
|
6
|
+
### Install uv
|
|
7
|
+
|
|
8
|
+
`uv` is used for dependency management and packaging in Python projects. To install uv, follow the [official instructions](https://docs.astral.sh/uv/getting-started/installation/).
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Project Structure
|
|
12
|
+
- src/: Contains the template formatting code.
|
|
13
|
+
- tests/: Contains pytest test cases.
|
|
14
|
+
- test_formatter_eval.py: Basic unit tests for the template formatter derived from `eval-framework`
|
|
15
|
+
- test_formatter_scaling.py: Basic unit tests for the template formatter derived from `scaling-internal`
|
|
16
|
+
- pyproject.toml: Configuration file for uv and other tools like MyPy, ruff and pytest.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
## Adding dependencies
|
|
20
|
+
|
|
21
|
+
- **Adding Production Dependencies**: These are dependencies necessary for your project to run. For example, if your project uses Pydantic for data validation, you would add it as a production dependency:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv add pydantic
|
|
25
|
+
```
|
|
26
|
+
- **Adding Development Dependencies**: These are dependencies that are only needed during development, such as testing libraries or linters. For instance, to add pytest for writing and running tests, you would specify it as a development dependency:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uv add --group dev pytest
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
After adding any new dependencies, you need to install them to update your project's virtual environment:
|
|
33
|
+
```bash
|
|
34
|
+
uv sync
|
|
35
|
+
```
|
|
36
|
+
This command ensures that all dependencies listed in your pyproject.toml file are correctly installed and available for use in your project.
|
|
37
|
+
|
|
38
|
+
To install all dependencies (including optional ones), run:
|
|
39
|
+
```bash
|
|
40
|
+
uv sync --all-extras
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
**Running Commands with uv**
|
|
45
|
+
|
|
46
|
+
`uv` creates a virtual environment for your project, which isolates your dependencies from the global Python environment. This isolation helps prevent version conflicts and ensures reproducibility. Here's how to use `uv` to run commands:
|
|
47
|
+
|
|
48
|
+
- **Installation**: To set up pre-commit hooks, you first need to install the pre-commit package and then install the hooks.
|
|
49
|
+
|
|
50
|
+
You can either follow the [install instructions](https://pre-commit.com/#install) or install it globally through `uv tool install pre-commit`.
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pre-commit install
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
- **Running Hooks Manually**: Although pre-commit hooks are triggered automatically before each commit, you can also run them manually to check your files at any time:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pre-commit run -a
|
|
60
|
+
```
|
|
61
|
+
This command runs all hooks against all files, which is useful for initial setup or periodic checks.
|
|
62
|
+
|
|
63
|
+
- **Current Hooks**:
|
|
64
|
+
- **Check JSON**: Ensures JSON files are valid.
|
|
65
|
+
- **Pretty format JSON**: Formats JSON files to be more readable.
|
|
66
|
+
- **Fix End of Files**: Ensures files end with a newline.
|
|
67
|
+
- **Trim Trailing Whitespace**: Removes unnecessary trailing whitespace.
|
|
68
|
+
- **Ruff**: Runs the Ruff linter to check Python code for stylistic and programming errors.
|
|
69
|
+
- **Ruff-format**: Automatically formats Python code using Ruff.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
- **Static Type Checking with MyPy**: To ensure your code is type-safe, run MyPy to check for type errors. This should be done frequently during development to catch type-related issues early:
|
|
73
|
+
```bash
|
|
74
|
+
uv run --all-extras mypy ./src
|
|
75
|
+
uv run --all-extras mypy ./tests
|
|
76
|
+
```
|
|
77
|
+
Run these commands after making changes to your source or test files to verify that your changes haven't introduced type errors.
|
|
78
|
+
|
|
79
|
+
- **Running Tests with pytest**: To ensure your code works as expected and hasn't broken existing functionality, run your tests:
|
|
80
|
+
```bash
|
|
81
|
+
uv run --all-extras pytest
|
|
82
|
+
```
|
|
83
|
+
Run this command frequently during development, especially before committing changes, to ensure all tests pass.
|
|
File without changes
|