eval-framework 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +177 -0
- eval_framework/context/eval.py +121 -0
- eval_framework/context/local.py +78 -0
- eval_framework/evaluation_generator.py +234 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +432 -0
- eval_framework/llm/base.py +180 -0
- eval_framework/llm/huggingface.py +418 -0
- eval_framework/llm/mistral.py +88 -0
- eval_framework/llm/models.py +28 -0
- eval_framework/llm/openai.py +400 -0
- eval_framework/llm/vllm.py +554 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +166 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/aidanbench.py +28 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +179 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +34 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/llm/utils.py +20 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/base.py +50 -0
- eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
- eval_framework/metrics/loglikelihood/dcs.py +43 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
- eval_framework/metrics/loglikelihood/ternary.py +42 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +351 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +88 -0
- eval_framework/result_processors/hf_uploader.py +75 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/result_processors/wandb_uploader.py +137 -0
- eval_framework/run.py +369 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +392 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/aidanbench.py +211 -0
- eval_framework/tasks/benchmarks/arc.py +70 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +64 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +133 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +201 -0
- eval_framework/tasks/benchmarks/gsm8k.py +150 -0
- eval_framework/tasks/benchmarks/hellaswag.py +69 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +215 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +85 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +64 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +110 -0
- eval_framework/tasks/benchmarks/sphyr.py +79 -0
- eval_framework/tasks/benchmarks/squad.py +211 -0
- eval_framework/tasks/benchmarks/struct_eval.py +116 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
- eval_framework/tasks/benchmarks/winogender.py +64 -0
- eval_framework/tasks/benchmarks/winogrande.py +69 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +136 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +81 -0
- eval_framework/tasks/task_names.py +324 -0
- eval_framework/tasks/utils.py +584 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/file_ops.py +245 -0
- eval_framework/utils/generate_task_docs.py +244 -0
- eval_framework/utils/helpers.py +32 -0
- eval_framework/utils/logging.py +62 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework/utils/tqdm_handler.py +14 -0
- eval_framework-0.2.7.dist-info/METADATA +548 -0
- eval_framework-0.2.7.dist-info/RECORD +170 -0
- eval_framework-0.2.7.dist-info/WHEEL +4 -0
- eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +537 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
eval_framework/__init__.py,sha256=dLv--h62kDYK2uN5aFpEowXpW2P9XLwMud-NwoiW_u4,120
|
|
2
|
+
eval_framework/base_config.py,sha256=LJOHr0MtE9PPsfbLmP2tpoa52Tt0rIHMaW3CTYVwehs,1236
|
|
3
|
+
eval_framework/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
eval_framework/context/determined.py,sha256=YNUnwZC8lIAOcsCV6ecHu47lWIMpMdXwpLsLm-DOAUk,7372
|
|
5
|
+
eval_framework/context/eval.py,sha256=pxd8L-GAjFz40sYcRskm6bAt9nno1jRaNSOJRU2S4YU,4515
|
|
6
|
+
eval_framework/context/local.py,sha256=XFqWSeaeG-ASioU3eabmbGvZPN9CEqa18sE3ukiVRXg,3192
|
|
7
|
+
eval_framework/evaluation_generator.py,sha256=edktgkM357yRwgeukDeZcJBYsClqWAQ3mLTRzvGOFXA,11846
|
|
8
|
+
eval_framework/exceptions.py,sha256=j4jjN2Y-8vMxf0Dfms1buAJHNMzEQ6kZca6l_z-lDBo,38
|
|
9
|
+
eval_framework/external/ifeval_impl/README.md,sha256=fC2t3BSbjW_Hl8iAUoTwiFpblgY1NeqeF67tl5ScWT4,408
|
|
10
|
+
eval_framework/external/ifeval_impl/instructions.py,sha256=fp94wBZv0SQgm7OTTrguh1yiscPoYst8MqoBmoO_A6k,55615
|
|
11
|
+
eval_framework/external/ifeval_impl/instructions_registry.py,sha256=TzNBdO5rHl3jPwvm-o83IpJ8l1o0DoG2jp7gDSd54RU,6722
|
|
12
|
+
eval_framework/external/ifeval_impl/instructions_util.py,sha256=qUb8wipLfBMvHv3UpMTn-yZay_2JU7X_524f141xHJs,26095
|
|
13
|
+
eval_framework/external/ifeval_impl/utils.py,sha256=i2ADNqLmcBlBAdL7BZMa4HoTXJ3DU01UL01-7grcebg,4537
|
|
14
|
+
eval_framework/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
eval_framework/llm/aleph_alpha.py,sha256=xvUicZoILrWtdgOaYTwtyRZ7WR41i61nBrGBUd3pieg,18461
|
|
16
|
+
eval_framework/llm/base.py,sha256=5JzVxVyix0DG9cVtViMMxN5Wt0cRyVKmiWq5L-1iEoE,7948
|
|
17
|
+
eval_framework/llm/huggingface.py,sha256=Ovq3QZ4ducKxPJp8FisTX5Q57EKB276yfzGPd0pD8KA,18041
|
|
18
|
+
eval_framework/llm/mistral.py,sha256=vn1spuH0uXCtL7zi9cmteoLIiCQy6c8EvyrD0BpVBOs,3544
|
|
19
|
+
eval_framework/llm/models.py,sha256=tSq3jpVBG9OVK4i1MWesZGtEWzbwfn6Vjv6PqLYrhak,937
|
|
20
|
+
eval_framework/llm/openai.py,sha256=QZo3vPPUrRxD76NlIGFgcMh84zWF1TrW706fgoUX-gw,16447
|
|
21
|
+
eval_framework/llm/vllm.py,sha256=9Oa712oJKYNTlKdu30pDS-R13HW9AoyQL_iF0AosRGU,21766
|
|
22
|
+
eval_framework/logger.py,sha256=8Bj7S8JRYh-SJZ3dEgueDIoVrhOjRyDsnRuLG61ft9E,61
|
|
23
|
+
eval_framework/main.py,sha256=yM0BlPAUTbUZ2VD_WdZK7nRbps8bSrWnjOQCu5-VhFE,6829
|
|
24
|
+
eval_framework/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
eval_framework/metrics/base.py,sha256=3VmIfC8AN-SXFf_7d_5fnwTQirDKBtJI5JxwDGOhtDU,1013
|
|
26
|
+
eval_framework/metrics/completion/__init__.py,sha256=3m1ekU7MH8JqV-6VHRBRQiPatqpZsNW6tQKpaXjpztE,52
|
|
27
|
+
eval_framework/metrics/completion/accuracy_completion.py,sha256=t-6lJBZ6dhhppepIkLEmB3TSd9qkGE3mrPYaDhnql98,697
|
|
28
|
+
eval_framework/metrics/completion/aidanbench.py,sha256=Kyr9aW2jdyt0NCfE8ytRCmNuU6f5lRImb3fHKLgoJUA,1048
|
|
29
|
+
eval_framework/metrics/completion/bleu.py,sha256=IDO3Hn-VgH7eT83iO9FCBI8gBUcj8cMOc1kfm_E73uI,3311
|
|
30
|
+
eval_framework/metrics/completion/chrf.py,sha256=o0zbwOpbL99fg00neET1Pb7jsfT8Sd1n-px_Jql43X8,2526
|
|
31
|
+
eval_framework/metrics/completion/code_assertion.py,sha256=f9XYPJzP6XWD2wqZ1_qWYyw56IhBnxp7hBmymw30ExA,1489
|
|
32
|
+
eval_framework/metrics/completion/code_execution_pass_at_one.py,sha256=1GyruuwS12UXfLSo3K7rRy2CZhC3W0oOAilFzAUtOeU,5123
|
|
33
|
+
eval_framework/metrics/completion/comet.py,sha256=M_4ITNfthjxqX8CgVKlxK5W7Gdu08FbXsmbGOx4SfSA,2333
|
|
34
|
+
eval_framework/metrics/completion/concordance_index.py,sha256=LfmM4KmXKiPbztoJaBRCDMA6lQdPFhHcRTYjNP0olQk,1369
|
|
35
|
+
eval_framework/metrics/completion/csv_format.py,sha256=sxo8xnEkGUw7FnkkZC2k58yn3GPuJQ_rJAFNLLo2sNE,3640
|
|
36
|
+
eval_framework/metrics/completion/cwe_accuracy.py,sha256=1LV35uxoDlKzLE_XWBItMMVsGBLqXP2DfqiI0L2T-dI,2130
|
|
37
|
+
eval_framework/metrics/completion/exponential_similarity.py,sha256=93rQV_pG7RbFMt0DWCDQe8iUiF9GzcTneHRxvH9tIgI,2702
|
|
38
|
+
eval_framework/metrics/completion/f1.py,sha256=ddHQXsQv5keZDrJvoY_nPPZtqZMEfrRrafeSWg6HQys,1512
|
|
39
|
+
eval_framework/metrics/completion/format_checker.py,sha256=JUgx3EbxsZEJr0bNlmQFQdQzkghvegq8QtC4vxQjvaI,1997
|
|
40
|
+
eval_framework/metrics/completion/grid_difference.py,sha256=sun639fzMNkhjoesfgRIsy7dofF5vxzbKlvVvUfA_y4,3104
|
|
41
|
+
eval_framework/metrics/completion/ifeval.py,sha256=93KxO8qfE6-9snppzpr3a7jCmCT2ciJOqWcK31VB2No,2578
|
|
42
|
+
eval_framework/metrics/completion/json_format.py,sha256=EV2Zb9OhETx-i2eJm48qR62S13r_2XHHVjM6UuZfKb4,6522
|
|
43
|
+
eval_framework/metrics/completion/language_checker.py,sha256=QO9yhHe99ZkvZxLSZ5m5B8N_oRVNsZeklg0b5MfUadg,3323
|
|
44
|
+
eval_framework/metrics/completion/length_control.py,sha256=15_S5m7SNFNR5KXNhmvTy3pGhtsuawlRU76w-ehLix8,3294
|
|
45
|
+
eval_framework/metrics/completion/math_reasoning_completion.py,sha256=wzhdNggAQxwC8Kpmb_ZsX6_SZx7h9IRfnFJuDBIBFxA,12221
|
|
46
|
+
eval_framework/metrics/completion/niah_accuracy.py,sha256=ycFUVXpJqdA_-aBvmzKUfaSpPi_-nCDY4F27kQjsPks,5803
|
|
47
|
+
eval_framework/metrics/completion/placeholder_checker.py,sha256=PhpPlcrP_QDYCOJuWK12ZfcUAOYys9IxZOKICTNUa1U,1147
|
|
48
|
+
eval_framework/metrics/completion/repetition.py,sha256=MRsap8ZDISDfC5luqWlQA05W_anjFU6XzzvD55LsM_M,3340
|
|
49
|
+
eval_framework/metrics/completion/rouge_1.py,sha256=Y1m7e9q258cIFjIfGShssneFn08_85ZQF6-YqIgOORQ,1514
|
|
50
|
+
eval_framework/metrics/completion/rouge_2.py,sha256=3GKFHVXHKvPOjk4SaU6D1vbykK5WeE6Q2Ogjhasa1uk,1978
|
|
51
|
+
eval_framework/metrics/completion/rouge_geometric_mean.py,sha256=0fqiWx72eJscuLkekh901CwhFInN9HoxQ2LJod40fJs,1730
|
|
52
|
+
eval_framework/metrics/completion/rouge_l.py,sha256=SwM1s7MQWKjVPlS0KyHcEH9pzkA-hlidz-4gM9kiTu4,2360
|
|
53
|
+
eval_framework/metrics/completion/struct_eval_metrics.py,sha256=8wBx7yTfzjww1wPST57X9sjrVNHavtKXZcOiCkbNrZk,8148
|
|
54
|
+
eval_framework/metrics/completion/ter.py,sha256=mskQejjl1RX0WuSQk1e42-L1QfH0kwTVIhDwqbaBNEc,2614
|
|
55
|
+
eval_framework/metrics/completion/text_counter.py,sha256=UXBOt7okRZHx6BuVcyAS9IeNoYSnryLKkdgYn0FArF8,7100
|
|
56
|
+
eval_framework/metrics/efficiency/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
57
|
+
eval_framework/metrics/efficiency/bytes_per_sequence_position.py,sha256=fPNqu_fQSqy__1Es5Zbm0niBr8N6j-jnprY-ysAFrds,1849
|
|
58
|
+
eval_framework/metrics/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
|
+
eval_framework/metrics/llm/base.py,sha256=pRqppTNG0MUpG-5rJqlQ4sGlR4lFcK1cZ9B7g9XikeM,1276
|
|
60
|
+
eval_framework/metrics/llm/graders/chatbot_style_grader.py,sha256=7tplUGC7G_F730t9Ij242dBRbQKUaCzURP1iX6ZKgrI,4114
|
|
61
|
+
eval_framework/metrics/llm/graders/coherence_grader.py,sha256=j-opPaQfv6co3_SXEjc8ICIeb-3rQ7I6sct8nLe-R1c,4208
|
|
62
|
+
eval_framework/metrics/llm/graders/comparison_grader.py,sha256=jEBnXQN6ebyexPCqg48L4ZpFoVYnT3WU-pjOY1NWzz0,7461
|
|
63
|
+
eval_framework/metrics/llm/graders/conciseness_grader.py,sha256=-WE7dOo7Jo57UzmesAr61WKurB9NegNBVtPLmViLOZw,3562
|
|
64
|
+
eval_framework/metrics/llm/graders/contains_names_grader.py,sha256=5NUGVcAzkyGJ1or5uReCbUJT3psplnHTd7dUkf_iR0Y,2724
|
|
65
|
+
eval_framework/metrics/llm/graders/format_correctness_grader.py,sha256=1ewPCXj97favA3BovNSOpHRILhtsTbmp5vWJfzk-968,4549
|
|
66
|
+
eval_framework/metrics/llm/graders/instruction_grader.py,sha256=v9ew30JHpO8LK99D2FYhFz6E-ikE4PIld3sCT79u0gk,11625
|
|
67
|
+
eval_framework/metrics/llm/graders/language.py,sha256=9YlEE3BjvzfHfQtRMTWrP_NxGbjKbZRbAjqo3GvL_wE,1720
|
|
68
|
+
eval_framework/metrics/llm/graders/long_context_grader.py,sha256=BX29D8BsVoVGOfGlQjAfFMJFw2Nn77puwMOBnHJvJoE,2476
|
|
69
|
+
eval_framework/metrics/llm/graders/models.py,sha256=PVGzyjOcmm-DN-NpoO8SzFyUNVoDLG330f3uFXG0SfE,2206
|
|
70
|
+
eval_framework/metrics/llm/graders/refusal_grader.py,sha256=SUFUiveL36LWyKR5w8LUgYl2Kx4aAc5IPu5uV8j4N5k,2272
|
|
71
|
+
eval_framework/metrics/llm/graders/sql_quality_grader.py,sha256=ooNCxBNKeyqFxf2nAKdtUcd7aIMQpmxcEn9iTo5XhiQ,5624
|
|
72
|
+
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py,sha256=lZJzXyMNYLhY4RmrPZsBxJByPXWMk8aeDjGxUArwv4U,4599
|
|
73
|
+
eval_framework/metrics/llm/llm_judge_chatbot_style.py,sha256=pb_GWN5xVHRuk64XPNkIqRV1htKaNmr-Cdjx9jxgGiw,1400
|
|
74
|
+
eval_framework/metrics/llm/llm_judge_coherence.py,sha256=NrzkJ2vMQnVKFuKd834StrpYVNBNre8-StAii0g59E8,1559
|
|
75
|
+
eval_framework/metrics/llm/llm_judge_completion_accuracy.py,sha256=KS1Fn3cZAyIfXd8LM_o2s9IjoHpftqtuSIJ3fGDAr6Y,1523
|
|
76
|
+
eval_framework/metrics/llm/llm_judge_conciseness.py,sha256=v2iSxBeUU3QTjdy0hx-9t5j0pf4LMnp5z2JCiqpN9_8,1439
|
|
77
|
+
eval_framework/metrics/llm/llm_judge_contains_names.py,sha256=7r-sAI6Qwej4fgQIhmotXtEK5ZaLcHxgyjbP7TYzRtE,1401
|
|
78
|
+
eval_framework/metrics/llm/llm_judge_format_correctness.py,sha256=AwHLblRtWSo7hg0sJpcdQAZP7ldrfZFDp2rGB9-6rns,1668
|
|
79
|
+
eval_framework/metrics/llm/llm_judge_instruction.py,sha256=PcXACNijZSYIfLoks-bqCgjqo0YPqQpX4O5GinC2SvE,2170
|
|
80
|
+
eval_framework/metrics/llm/llm_judge_mtbench_pair.py,sha256=DhaM5iDJNDgg9TZNo7FPXldmZwuVtZWCPxO6ppFj1O0,29297
|
|
81
|
+
eval_framework/metrics/llm/llm_judge_mtbench_single.py,sha256=gAeewUHh-EuS9mP57Iiptl1Z0RuSHzEF8ldI_2Howkc,18468
|
|
82
|
+
eval_framework/metrics/llm/llm_judge_refusal.py,sha256=iAoOstgOvKtk9M9wqVqrf21mM0Xbss4EraO7R3g9FBQ,1418
|
|
83
|
+
eval_framework/metrics/llm/llm_judge_sql.py,sha256=qMj2pHzijq2lVHqToewQL_xJSgKLulZWSb64996ztnQ,14480
|
|
84
|
+
eval_framework/metrics/llm/llm_judge_world_knowledge.py,sha256=C48aHS6bcVtGMk0YxzqDAGiHekypyeo--SK7EFVN5Jc,1517
|
|
85
|
+
eval_framework/metrics/llm/utils.py,sha256=3rfaP7O1c8OOatOGNO3kZcLFCvZXoPplSjkju7eck3E,728
|
|
86
|
+
eval_framework/metrics/loglikelihood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
|
+
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py,sha256=l0OxJFSQiLnwJdfX72SH9k-krZr8AI1FOUYlHfiT2Q0,1921
|
|
88
|
+
eval_framework/metrics/loglikelihood/base.py,sha256=TJoJ5jXjPiC0xHeIQiWWRha2C_h1A1Bd0U9pwhQBdRg,1935
|
|
89
|
+
eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py,sha256=0meEs3EaVU72SwflS0Em5DvSEaFbu2zD_NOtVjIr6CQ,1148
|
|
90
|
+
eval_framework/metrics/loglikelihood/dcs.py,sha256=HqqpL_BdoB-Uq_6buDSUK2YVUkc20nstnHPqGdDM2RY,1935
|
|
91
|
+
eval_framework/metrics/loglikelihood/probability_mass.py,sha256=HyBlsz64lGcU2PCL2AYZQ9qS7olOofvtpUnskdT8D0s,2196
|
|
92
|
+
eval_framework/metrics/loglikelihood/ternary.py,sha256=aU1RyXBnKnElHSNOxXOw3fvL0iUuOLPlh-TQCTAk0Bw,1825
|
|
93
|
+
eval_framework/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
94
|
+
eval_framework/response_generator.py,sha256=blIOIzP25JPEmQYPUqqLCkR2NRgOnlBLfN1T1DpCgHU,16383
|
|
95
|
+
eval_framework/result_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
96
|
+
eval_framework/result_processors/base.py,sha256=30HzZLs4Rk0MhHKuE13wnkMxz47yBjeMraLzmz6f-Yc,2445
|
|
97
|
+
eval_framework/result_processors/hf_uploader.py,sha256=P2EUX5KB5DFAl4PZYTZWOG2a_8aYIBwFKSeVMu2tGs0,2898
|
|
98
|
+
eval_framework/result_processors/result_processor.py,sha256=eIzs30XwbhAO7vODudNw0oS8jsk9N04G0_wdhcI9MkQ,5456
|
|
99
|
+
eval_framework/result_processors/wandb_uploader.py,sha256=3noM9S0kVkx9RbblvB5I4Fy4jtRTWmq1pGQSSR1Des8,6266
|
|
100
|
+
eval_framework/run.py,sha256=-oqjRdkIGsQgXWXI3cGcTQAk9FDpvFfiUbiFu-ygn5U,11967
|
|
101
|
+
eval_framework/run_direct.py,sha256=KMWkLDuDt-HPlmjsSGKAiXd7LlrpVUPKv89Gk3i0snA,1176
|
|
102
|
+
eval_framework/shared/types.py,sha256=lPA5uhdRgs3H---SFsjUOYwUkqBYL0K2Y2JvxCOyMLc,8841
|
|
103
|
+
eval_framework/tasks/__init__.py,sha256=Fzs8DY53Dt0Gsu34Ro6Dk6by9qgaFF0UIIHERl6PO5g,120
|
|
104
|
+
eval_framework/tasks/base.py,sha256=LWkpIrdBDSq9VYi8W8iKcWve2T1_Oh76YDmczJcvOrA,15988
|
|
105
|
+
eval_framework/tasks/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
|
+
eval_framework/tasks/benchmarks/aidanbench.py,sha256=KIilefSLgM3SINYlALxKiWE9hlOwuNZaTZHnIJerOv4,9963
|
|
107
|
+
eval_framework/tasks/benchmarks/arc.py,sha256=XC968m7yMLDdUWQn3S8CzlUVzzyC2CwPziiTB5yjf1s,2854
|
|
108
|
+
eval_framework/tasks/benchmarks/arc_de.py,sha256=Ng7n0HeAEPh2SEHOTwIl1ccVCWH9iI0-U11mBe0aR38,1871
|
|
109
|
+
eval_framework/tasks/benchmarks/arc_fi.py,sha256=fgjdHN0pq8V_R_vMsLsk9Q2Mf7JwKF0vVFazESHtf2o,1858
|
|
110
|
+
eval_framework/tasks/benchmarks/belebele.py,sha256=x3V5DsNdkJpwMCWi5SOVIMJS7-ZZhi5E3XttGa23bR8,2213
|
|
111
|
+
eval_framework/tasks/benchmarks/bigcodebench.py,sha256=vrEySwqQTAEFnWGDoTzAOL6IVSwulbK7Rp60hxyOlPc,5892
|
|
112
|
+
eval_framework/tasks/benchmarks/casehold.py,sha256=hFWW1LnzVHk8un58flGLh_JOv3h95XHwOB-WenIWOJc,1727
|
|
113
|
+
eval_framework/tasks/benchmarks/chembench.py,sha256=GaPW0oBYLx4cQaZuvcFdDIL5XG5YUurqRcdaWXSzVgo,3522
|
|
114
|
+
eval_framework/tasks/benchmarks/copa.py,sha256=pTOBAnahtY8jiT8b0RV8AJwjzF_HBp89Fiu18msXulw,2535
|
|
115
|
+
eval_framework/tasks/benchmarks/duc.py,sha256=6VAk38UrbuG63gr-K69WZ016g-EH0ONxTzictJ-AcN0,3516
|
|
116
|
+
eval_framework/tasks/benchmarks/flores200.py,sha256=WOhjn2RzwFuvMIHL_t2pvSBTS2-zPIsonJD1ZGXhJdc,5177
|
|
117
|
+
eval_framework/tasks/benchmarks/flores_plus.py,sha256=bTKH8ECFdZRw-3RV-37a2clpY7u1Y2QigVeXIQoI2c0,3346
|
|
118
|
+
eval_framework/tasks/benchmarks/gpqa.py,sha256=n_CzSMeSWDAh21g60cC08Ut_PwcQn6vZwWTgtCCnP1U,8873
|
|
119
|
+
eval_framework/tasks/benchmarks/gsm8k.py,sha256=nU2iVGI9YEa-mh1Z3nzCwEcPNXEX15XlUWrnLpaAk2M,5874
|
|
120
|
+
eval_framework/tasks/benchmarks/hellaswag.py,sha256=PtC0AkFceUEFqt5HVbMHRAAQagsxq1x36yWcAoRp5YQ,2763
|
|
121
|
+
eval_framework/tasks/benchmarks/hellaswag_de.py,sha256=09sItfKknm6Xm-NKm5HcBgm-EYlm0dBqdgkEcXCReVk,2091
|
|
122
|
+
eval_framework/tasks/benchmarks/humaneval.py,sha256=wDP8ymSaqrhe28pTXavt0fxayA-cdUU8eOp5V8Q6T40,3370
|
|
123
|
+
eval_framework/tasks/benchmarks/ifeval.py,sha256=sww3y21udT1xCdf1fmh7z4EZ6-XLMR5fFgqUdwCUmZY,2826
|
|
124
|
+
eval_framework/tasks/benchmarks/include.py,sha256=Io4IFYTOCEoolVMRvjMEc58YJSJh4FcNnJ7wCYOmeIo,3380
|
|
125
|
+
eval_framework/tasks/benchmarks/infinitebench.py,sha256=bDNkNNe2v1FNOwGR9fHbTaUXFJJxNlHR6emarFjPFE4,11024
|
|
126
|
+
eval_framework/tasks/benchmarks/math_reasoning.py,sha256=mCqNY9ZKMp_k09S3ropdgiFma7SzWbs65rUsYyyouOA,22750
|
|
127
|
+
eval_framework/tasks/benchmarks/mbpp.py,sha256=hX8NnmI8iV4L35BX1-OCESNtfQq6hPryVKXJ_rsYQCI,7530
|
|
128
|
+
eval_framework/tasks/benchmarks/mmlu.py,sha256=O2RMG9u8zoUsQ06A8LIXNTBXZTh84_95REJ2sy4JN30,7755
|
|
129
|
+
eval_framework/tasks/benchmarks/mmlu_de.py,sha256=MAmiVWR-tSQFT383lKz_z0b0pdhWQ7PjPwtUuIGCb64,4634
|
|
130
|
+
eval_framework/tasks/benchmarks/mmlu_pro.py,sha256=II_adK324gobsZmrK2EP3yLgFaSV9WHhchwVXRky_UI,6398
|
|
131
|
+
eval_framework/tasks/benchmarks/mmmlu.py,sha256=Kr6WVv4Z2SH3VJU549LF1yUXIPf1wvL60t7WdxfOcY8,24029
|
|
132
|
+
eval_framework/tasks/benchmarks/openbookqa.py,sha256=k-wqRliIdzNsCoExlO9NX-cL6Fzoi5do2A5kBwcAQRY,3663
|
|
133
|
+
eval_framework/tasks/benchmarks/opengptx_eu20.py,sha256=w2ITsVeyLKakZr_VCMaLckvqqSL3MXu9c8ZcywJjV6E,14923
|
|
134
|
+
eval_framework/tasks/benchmarks/pawsx.py,sha256=VnTTi396NQzBiUfUxsJ14WRvDibYEDL4S7cHDjoQ96Y,3106
|
|
135
|
+
eval_framework/tasks/benchmarks/piqa.py,sha256=_dyCkVzXrIm1t_eTjPi2xqhBtC9xaNnsLwui4JEgnIk,2517
|
|
136
|
+
eval_framework/tasks/benchmarks/quality.py,sha256=8GTmOAsX3cxgNDYaTT31cN8N-xSr6iVIR0pvhD8aqeo,1955
|
|
137
|
+
eval_framework/tasks/benchmarks/sciq.py,sha256=CTZuKsyr2CuK-tbz2XnNrQSdttb1_QwXqkBLoEcsRS8,4271
|
|
138
|
+
eval_framework/tasks/benchmarks/sphyr.py,sha256=CLp3eLkKly60IkYKyV_-nos3bUKkBhQLpakct2qiEOg,3199
|
|
139
|
+
eval_framework/tasks/benchmarks/squad.py,sha256=zrd-PPEMLx12SEyw0-qLe5YKC424fZAf7AWnX9AjTow,8201
|
|
140
|
+
eval_framework/tasks/benchmarks/struct_eval.py,sha256=9NnDdkMWAitDfT9ksrb-F2_GCMZU326xU1Hevwk0ysY,4050
|
|
141
|
+
eval_framework/tasks/benchmarks/tablebench.py,sha256=MxQlW7d62hBqm0HZ93XxzVHVn_JIeqK4UJO4-b5tO3U,4912
|
|
142
|
+
eval_framework/tasks/benchmarks/triviaqa.py,sha256=S1RTJrJaeowmgjtMkcDxjUbu_9y35Eo1x9HouOqA_M0,1646
|
|
143
|
+
eval_framework/tasks/benchmarks/truthfulqa.py,sha256=JO2-oCuf7DjxOkpESvEeWzAUh7oMhicOaIGLTe017D4,4915
|
|
144
|
+
eval_framework/tasks/benchmarks/winogender.py,sha256=18OLgtKZIsnN2P1YOPzc8LrjnsRy4zXSGXdKRNn_hxQ,2609
|
|
145
|
+
eval_framework/tasks/benchmarks/winogrande.py,sha256=QDTBHt4XtNQ1QF_Y7wScI9_pK_bY_b6O9dQ9VrVTqC4,2705
|
|
146
|
+
eval_framework/tasks/benchmarks/winox.py,sha256=Qvbb_HLYpQlEKeOkJ22nFGAWlYjPQhMEp-Ra95_6k0U,1996
|
|
147
|
+
eval_framework/tasks/benchmarks/wmt.py,sha256=I54AmQNX4uv_7k5U3nYdQi2DhQIuAkOzRZCbUZYyZ64,5690
|
|
148
|
+
eval_framework/tasks/benchmarks/zero_scrolls.py,sha256=qUgoDICnmEXM1EZdV2PTUXf4YzUjyOCwmgWACN_-0zE,7862
|
|
149
|
+
eval_framework/tasks/eval_config.py,sha256=xfFhzdfCHOMx1v2vl8Lp4XXC2SxN34PlbDzqo9nrGc8,5427
|
|
150
|
+
eval_framework/tasks/perturbation.py,sha256=ZtXMqPk9YSWiX6ytgUXEeacA1LhAgvkJyxuO7MruJho,3532
|
|
151
|
+
eval_framework/tasks/registry.py,sha256=d4uYpg8JOfStl-r0mExaJOYL4rqsXD9RAQ93fi32D7I,5738
|
|
152
|
+
eval_framework/tasks/task_loader.py,sha256=uXurAyS35y90cMUW_Sc2bFZBXuN34FiACzUniLRxjw4,3784
|
|
153
|
+
eval_framework/tasks/task_names.py,sha256=VH5eIrHdDps8zXMlKEwiGFuEgHpeZY9sx7txJq0H-qw,16463
|
|
154
|
+
eval_framework/tasks/utils.py,sha256=jjo5JDDTzlTevx5angDp9terg-eW6z8g1ZmAuFXdslw,20006
|
|
155
|
+
eval_framework/utils/constants.py,sha256=LEElGdYrkIWm8dJa7lfD5LbL-fwkF17Z0nQ7_XVg098,164
|
|
156
|
+
eval_framework/utils/file_ops.py,sha256=tGLHnwnZsm3gc8D6kGFzONIqR5wtdKDEzDKziRYoyo8,10922
|
|
157
|
+
eval_framework/utils/generate_task_docs.py,sha256=aPTz8M-Dlh06z4Ce41OIo81KnCpYnHIuusZQtpid2I4,9780
|
|
158
|
+
eval_framework/utils/helpers.py,sha256=GZJNUWaKg-6LcSU4gm585fX0kKd9Y_gvlAevYaNitSg,1253
|
|
159
|
+
eval_framework/utils/logging.py,sha256=xqwop0qpSRG8KTvzY31hX6Ew0ly_LqtZ16RItHpPmFE,1945
|
|
160
|
+
eval_framework/utils/packaging.py,sha256=Z_eXjzcgCvifJwJ-pqeAtNPVjscgR6QYNKe4E-iSnFc,1889
|
|
161
|
+
eval_framework/utils/tqdm_handler.py,sha256=-FHPrX29u5dWhbzworXIJ_I1EdfeoWZsWbwmRXz3Fuk,298
|
|
162
|
+
template_formatting/README.md,sha256=gVrps3xXKPP87lWmKzaaEjNMBikQVj_MMZ-FdG3O6Xg,3787
|
|
163
|
+
template_formatting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
164
|
+
template_formatting/formatter.py,sha256=J6qGovNQYo5Cc_R7XV8iCO0d-UNaCdO-1N_FyD4mhpw,21522
|
|
165
|
+
template_formatting/mistral_formatter.py,sha256=NJESWDsd_QFx8USoTGJ1QOYczwXf-ObjnpTy7mqzjK0,6734
|
|
166
|
+
template_formatting/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
|
+
eval_framework-0.2.7.dist-info/WHEEL,sha256=KSLUh82mDPEPk0Bx0ScXlWL64bc8KmzIPNcpQZFV-6E,79
|
|
168
|
+
eval_framework-0.2.7.dist-info/entry_points.txt,sha256=k4dpbNwZ5XnovyqrScWTZ-UYzf_EPYOvZA2QTkqrYlk,59
|
|
169
|
+
eval_framework-0.2.7.dist-info/METADATA,sha256=Ye0vaxk9Xv_aO41sRyognvdKde7eu2Ef-nXAypQljsA,29424
|
|
170
|
+
eval_framework-0.2.7.dist-info/RECORD,,
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
|
|
2
|
+
# Internal template formatting package
|
|
3
|
+
|
|
4
|
+
Single source of truth for internal template formatting. Ensures compatibility between `scaling-internal` and `eval-framework`.
|
|
5
|
+
|
|
6
|
+
### Install uv
|
|
7
|
+
|
|
8
|
+
`uv` is used for dependency management and packaging in Python projects. To install uv, follow the [official instructions](https://docs.astral.sh/uv/getting-started/installation/).
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Project Structure
|
|
12
|
+
- src/: Contains the template formatting code
|
|
13
|
+
- tests/: Contains pytest test cases.
|
|
14
|
+
- test_formatter_eval.py: Basic unit tests for the template formatter derived from `eval-framework`
|
|
15
|
+
- test_formatter_scaling.py: Basic unit tests for the template formatter derived from `scaling-internal`
|
|
16
|
+
- pyproject.toml: Configuration file for uv and other tools like MyPy, ruff and pytest.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
## Adding dependencies
|
|
20
|
+
|
|
21
|
+
- **Adding Production Dependencies**: These are dependencies necessary for your project to run. For example, if your project uses Pydantic for data validation, you would add it as a production dependency:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv add pydantic
|
|
25
|
+
```
|
|
26
|
+
- **Adding Development Dependencies**: These are dependencies that are only needed during development, such as testing libraries or linters. For instance, to add pytest for writing and running tests, you would specify it as a development dependency:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uv add --group dev pytest
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
After adding any new dependencies, you need to install them to update your project's virtual environment:
|
|
33
|
+
```bash
|
|
34
|
+
uv sync
|
|
35
|
+
```
|
|
36
|
+
This command ensures that all dependencies listed in your pyproject.toml file are correctly installed and available for use in your project.
|
|
37
|
+
|
|
38
|
+
To install all dependencies (including optional ones), run
|
|
39
|
+
```bash
|
|
40
|
+
uv sync --all-extras
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
**Running Commands with uv**
|
|
45
|
+
|
|
46
|
+
`uv` creates a virtual environment for your project, which isolates your dependencies from the global Python environment. This isolation helps prevent version conflicts and ensures reproducibility. Here's how to use `uv` to run commands:
|
|
47
|
+
|
|
48
|
+
- **Installation**: To set up pre-commit hooks, you first need to install the pre-commit package and then install the hooks.
|
|
49
|
+
|
|
50
|
+
You can either follow the [install instructions](https://pre-commit.com/#install) or install it globally through `uv tool install pre-commit`.
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pre-commit install
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
- **Running Hooks Manually**: Although pre-commit hooks are triggered automatically before each commit, you can also run them manually to check your files at any time:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pre-commit run -a
|
|
60
|
+
```
|
|
61
|
+
This command runs all hooks against all files, which is useful for initial setup or periodic checks.
|
|
62
|
+
|
|
63
|
+
- **Current Hooks**:
|
|
64
|
+
- **Check JSON**: Ensures JSON files are valid.
|
|
65
|
+
- **Pretty format JSON**: Formats JSON files to be more readable.
|
|
66
|
+
- **Fix End of Files**: Ensures files end with a newline.
|
|
67
|
+
- **Trim Trailing Whitespace**: Removes unnecessary trailing whitespace.
|
|
68
|
+
- **Ruff**: Runs the Ruff linter to check Python code for stylistic and programming errors.
|
|
69
|
+
- **Ruff-format**: Automatically formats Python code using Ruff.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
- **Static Type Checking with MyPy**: To ensure your code is type-safe, run MyPy to check for type errors. This should be done frequently during development to catch type-related issues early:
|
|
73
|
+
```bash
|
|
74
|
+
uv run --all-extras mypy ./src
|
|
75
|
+
uv run --all-extras mypy ./tests
|
|
76
|
+
```
|
|
77
|
+
Run these commands after making changes to your source or test files to verify that your changes haven't introduced type errors.
|
|
78
|
+
|
|
79
|
+
- **Running Tests with pytest**: To ensure your code works as expected and hasn't broken existing functionality, run your tests:
|
|
80
|
+
```bash
|
|
81
|
+
uv run --all-extras pytest
|
|
82
|
+
```
|
|
83
|
+
Run this command frequently during development, especially before committing changes, to ensure all tests pass.
|
|
File without changes
|