evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/chat_adapter.py +2 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
|
|
3
3
|
evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
4
|
+
evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
|
|
5
|
+
evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
|
-
evalscope/summarizer.py,sha256=
|
|
8
|
-
evalscope/version.py,sha256=
|
|
7
|
+
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
+
evalscope/version.py,sha256=Bt6Ke7m38AQOnf3xTgdKX-eFqm09Gu5GYEjTkjPrPEk,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,13 +56,13 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
|
|
61
61
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
62
62
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
63
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
63
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
|
|
64
64
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
65
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
|
|
66
66
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
67
67
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
68
68
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -91,63 +91,76 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
91
91
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
92
92
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
93
93
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
94
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256
|
|
94
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
|
|
95
95
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
96
96
|
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
97
97
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
98
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
99
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
99
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
|
|
100
100
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
101
101
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
102
102
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
103
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
103
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
|
|
104
104
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
105
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
|
|
106
|
+
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
+
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
108
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=Z5TtgPCCT8AVmFCMVIVmfhqe51CyCTaLSYTiev7smPw,4232
|
|
106
109
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
107
110
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
108
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
111
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
|
|
109
112
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
110
113
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
111
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
114
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
|
|
112
115
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
113
116
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
114
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
117
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
|
|
118
|
+
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
119
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=Jx04TddVZE1gk4wXyljhtt3CLo-7Ux_RcLLMlTV-Nhg,2024
|
|
120
|
+
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
121
|
+
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
122
|
+
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
123
|
+
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
124
|
+
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
125
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
|
|
115
126
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
127
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
117
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256
|
|
128
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
|
|
118
129
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
119
130
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
120
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
131
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
|
|
121
132
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
122
133
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
123
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
134
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
|
|
124
135
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
125
136
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
126
137
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
127
138
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
128
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
139
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
|
|
129
140
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
130
141
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
131
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
142
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
|
|
132
143
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
133
144
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
134
|
-
evalscope/cli/cli.py,sha256=
|
|
145
|
+
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
146
|
+
evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
|
|
135
147
|
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
136
148
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
137
149
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
138
150
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
139
|
-
evalscope/collections/evaluator.py,sha256=
|
|
140
|
-
evalscope/collections/sampler.py,sha256=
|
|
141
|
-
evalscope/collections/schema.py,sha256=
|
|
151
|
+
evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
|
|
152
|
+
evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
|
|
153
|
+
evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
|
|
142
154
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
143
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
155
|
+
evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
|
|
144
156
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
145
157
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
146
158
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
147
|
-
evalscope/metrics/__init__.py,sha256=
|
|
159
|
+
evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
|
|
148
160
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
149
161
|
evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
|
|
150
|
-
evalscope/metrics/metrics.py,sha256=
|
|
162
|
+
evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
|
|
163
|
+
evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
|
|
151
164
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
152
165
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
153
166
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
@@ -155,32 +168,33 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
|
|
|
155
168
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
156
169
|
evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
|
|
157
170
|
evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
|
|
158
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
171
|
+
evalscope/models/chat_adapter.py,sha256=9DIMwacjrR647pYVKgeYn090ZKBVHmMD_mf3Gz2vdw0,5461
|
|
159
172
|
evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
|
|
160
173
|
evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
|
|
161
174
|
evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
|
|
162
175
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
163
|
-
evalscope/models/server_adapter.py,sha256=
|
|
176
|
+
evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
|
|
164
177
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
165
178
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
166
|
-
evalscope/
|
|
179
|
+
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
180
|
+
evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
|
|
167
181
|
evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
|
|
168
182
|
evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
|
|
169
183
|
evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
|
|
170
|
-
evalscope/perf/main.py,sha256=
|
|
184
|
+
evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
|
|
171
185
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
172
186
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
173
187
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
174
188
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
175
|
-
evalscope/perf/plugin/api/custom_api.py,sha256=
|
|
189
|
+
evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
|
|
176
190
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
177
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
191
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
|
|
178
192
|
evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
|
|
179
193
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
180
194
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
181
|
-
evalscope/perf/plugin/datasets/flickr8k.py,sha256=
|
|
195
|
+
evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
|
|
182
196
|
evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
|
|
183
|
-
evalscope/perf/plugin/datasets/longalpaca.py,sha256=
|
|
197
|
+
evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
|
|
184
198
|
evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
|
|
185
199
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
186
200
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -210,6 +224,11 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
|
|
|
210
224
|
evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
|
|
211
225
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
212
226
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
227
|
+
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
228
|
+
evalscope/report/app.py,sha256=kB4CCrAoIONRc37Np8B3QsLxJBD_j2Sw2xtfR1FgfC0,26087
|
|
229
|
+
evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
|
|
230
|
+
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
231
|
+
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
213
232
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
214
233
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
215
234
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -237,23 +256,19 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
|
|
|
237
256
|
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
|
|
238
257
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
239
258
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
|
|
240
|
-
evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
241
|
-
evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
|
|
242
|
-
evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
|
|
243
|
-
evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
244
259
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
245
260
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
246
|
-
evalscope/utils/chat_service.py,sha256=
|
|
261
|
+
evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
|
|
247
262
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
248
263
|
evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
|
|
249
|
-
evalscope/utils/logger.py,sha256=
|
|
250
|
-
evalscope/utils/model_utils.py,sha256=
|
|
264
|
+
evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
|
|
265
|
+
evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
|
|
251
266
|
evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
|
|
252
267
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
253
268
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
254
269
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
255
|
-
tests/cli/test_collection.py,sha256=
|
|
256
|
-
tests/cli/test_run.py,sha256=
|
|
270
|
+
tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
|
|
271
|
+
tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
|
|
257
272
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
258
273
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
259
274
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -266,9 +281,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
266
281
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
267
282
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
268
283
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
269
|
-
evalscope-0.
|
|
270
|
-
evalscope-0.
|
|
271
|
-
evalscope-0.
|
|
272
|
-
evalscope-0.
|
|
273
|
-
evalscope-0.
|
|
274
|
-
evalscope-0.
|
|
284
|
+
evalscope-0.10.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
285
|
+
evalscope-0.10.1.dist-info/METADATA,sha256=-HQt66q9NaZvcNwiXgLW87aduUogXKaHYz6JokxtEXc,28975
|
|
286
|
+
evalscope-0.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
287
|
+
evalscope-0.10.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
288
|
+
evalscope-0.10.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
289
|
+
evalscope-0.10.1.dist-info/RECORD,,
|
tests/cli/test_collection.py
CHANGED
|
@@ -12,15 +12,19 @@ class TestCollection(unittest.TestCase):
|
|
|
12
12
|
def test_create_collection(self):
|
|
13
13
|
schema = CollectionSchema(name='math&reasoning', datasets=[
|
|
14
14
|
CollectionSchema(name='math', datasets=[
|
|
15
|
+
CollectionSchema(name='generation', datasets=[
|
|
15
16
|
DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
|
|
16
17
|
DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
|
|
18
|
+
]),
|
|
19
|
+
CollectionSchema(name='multiple_choice', datasets=[
|
|
17
20
|
DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
|
|
18
21
|
DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
|
|
22
|
+
]),
|
|
19
23
|
]),
|
|
20
24
|
CollectionSchema(name='reasoning', datasets=[
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
25
|
+
DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
26
|
+
DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
|
|
27
|
+
DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
24
28
|
]),
|
|
25
29
|
])
|
|
26
30
|
print(schema.to_dict())
|
|
@@ -32,7 +36,7 @@ class TestCollection(unittest.TestCase):
|
|
|
32
36
|
def test_generate_data(self):
|
|
33
37
|
schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
|
|
34
38
|
print(schema.to_dict())
|
|
35
|
-
mixed_data = WeightedSampler(schema
|
|
39
|
+
mixed_data = WeightedSampler(schema).sample(100)
|
|
36
40
|
dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
|
|
37
41
|
|
|
38
42
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
@@ -40,14 +44,14 @@ class TestCollection(unittest.TestCase):
|
|
|
40
44
|
from evalscope import TaskConfig, run_task
|
|
41
45
|
|
|
42
46
|
task_cfg = TaskConfig(
|
|
43
|
-
model='
|
|
47
|
+
model='Qwen2.5-7B-Instruct',
|
|
44
48
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
45
49
|
api_key='EMPTY',
|
|
46
50
|
eval_type=EvalType.SERVICE,
|
|
47
51
|
datasets=['data_collection'],
|
|
48
52
|
dataset_args={'data_collection': {
|
|
49
|
-
|
|
50
|
-
'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
53
|
+
'local_path': 'outputs/mixed_data_test.jsonl'
|
|
54
|
+
# 'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
51
55
|
}},
|
|
52
56
|
)
|
|
53
57
|
run_task(task_cfg=task_cfg)
|
tests/cli/test_run.py
CHANGED
|
@@ -128,12 +128,15 @@ class TestRun(unittest.TestCase):
|
|
|
128
128
|
from evalscope.config import TaskConfig
|
|
129
129
|
|
|
130
130
|
task_cfg = TaskConfig(
|
|
131
|
-
model='
|
|
131
|
+
model='Qwen2.5-7B-Instruct',
|
|
132
132
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
133
133
|
api_key='EMPTY',
|
|
134
134
|
eval_type=EvalType.SERVICE,
|
|
135
135
|
datasets=[
|
|
136
|
-
'
|
|
136
|
+
'iquiz',
|
|
137
|
+
# 'ifeval',
|
|
138
|
+
# 'mmlu',
|
|
139
|
+
# 'mmlu_pro',
|
|
137
140
|
# 'race',
|
|
138
141
|
# 'trivia_qa',
|
|
139
142
|
# 'cmmlu',
|
|
@@ -145,8 +148,14 @@ class TestRun(unittest.TestCase):
|
|
|
145
148
|
# 'bbh',
|
|
146
149
|
# 'hellaswag',
|
|
147
150
|
],
|
|
148
|
-
|
|
149
|
-
|
|
151
|
+
dataset_args={
|
|
152
|
+
'ceval': {
|
|
153
|
+
'subset_list': [
|
|
154
|
+
'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
|
|
155
|
+
]
|
|
156
|
+
}
|
|
157
|
+
},
|
|
158
|
+
# limit=10
|
|
150
159
|
)
|
|
151
160
|
|
|
152
161
|
run_task(task_cfg=task_cfg)
|
evalscope/tools/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
3
|
-
import glob
|
|
4
|
-
import json
|
|
5
|
-
import os
|
|
6
|
-
from collections import defaultdict
|
|
7
|
-
from tabulate import tabulate
|
|
8
|
-
|
|
9
|
-
from evalscope.utils.logger import get_logger
|
|
10
|
-
|
|
11
|
-
logger = get_logger()
|
|
12
|
-
"""
|
|
13
|
-
Combine and generate table for reports of LLMs.
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def get_report(report_file: str):
|
|
18
|
-
data_d: dict = json.load(open(report_file, 'r'))
|
|
19
|
-
dataset_name = data_d['dataset_name']
|
|
20
|
-
model_name = data_d['model_name']
|
|
21
|
-
score = data_d['score'] # float or dict
|
|
22
|
-
metric = data_d['metric']
|
|
23
|
-
score_d = {}
|
|
24
|
-
if isinstance(score, dict):
|
|
25
|
-
score_d = score
|
|
26
|
-
elif isinstance(score, float):
|
|
27
|
-
score_d[metric] = score
|
|
28
|
-
else:
|
|
29
|
-
raise ValueError(f'Unknown score type: {type(score)}')
|
|
30
|
-
score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
|
|
31
|
-
|
|
32
|
-
return model_name, {'dataset_name': dataset_name, 'score': score_str}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def get_model_reports(model_report_dir: str):
|
|
36
|
-
model_report_dir = os.path.normpath(model_report_dir)
|
|
37
|
-
report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
|
|
38
|
-
|
|
39
|
-
model_reports_d = defaultdict(list)
|
|
40
|
-
for file_path in report_files:
|
|
41
|
-
model_name, report_d = get_report(file_path)
|
|
42
|
-
model_reports_d[model_name].append(report_d)
|
|
43
|
-
|
|
44
|
-
return model_reports_d
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def gen_table(reports_path_list: list):
|
|
48
|
-
table_values = []
|
|
49
|
-
headers = ['Model']
|
|
50
|
-
is_headers_set = False
|
|
51
|
-
|
|
52
|
-
for report_path in reports_path_list:
|
|
53
|
-
model_reports_d = get_model_reports(report_path)
|
|
54
|
-
for model_name, report_list in model_reports_d.items():
|
|
55
|
-
report_list = sorted(report_list, key=lambda x: x['dataset_name'])
|
|
56
|
-
if not is_headers_set:
|
|
57
|
-
headers.extend([x['dataset_name'] for x in report_list])
|
|
58
|
-
is_headers_set = True
|
|
59
|
-
single_row = []
|
|
60
|
-
single_row.append(model_name)
|
|
61
|
-
for single_report in report_list:
|
|
62
|
-
# e.g. '28.51 (acc)'
|
|
63
|
-
single_row.append(single_report['score'])
|
|
64
|
-
table_values.append(single_row)
|
|
65
|
-
|
|
66
|
-
report_table = tabulate(table_values, headers=headers, tablefmt='grid')
|
|
67
|
-
return report_table
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class ReportsRecorder:
    """Collect per-dataset report files and merge them into one ``metric.json``.

    Report paths are registered with ``append_path`` and merged by
    ``dump_reports``. When both ``oss_url`` and ``endpoint`` are given, the
    merged file is additionally uploaded to the OSS bucket named in the URL.
    """

    # Kept at class level for backward compatibility with code that reads them
    # from the class; every instance gets its own lists in ``__init__``.
    COMMON_DATASET_PATH = []
    CUSTOM_DATASET_PATH = []

    def __init__(self, oss_url: str = '', endpoint: str = ''):
        """Create a recorder, optionally bound to an OSS bucket.

        Args:
            oss_url: Target of the form ``oss://<bucket>/<object/path>``.
            endpoint: OSS endpoint. Upload is enabled only when both
                ``oss_url`` and ``endpoint`` are non-empty.
        """
        # Shadow the class-level lists so that paths appended to one recorder
        # do not leak into every other recorder in the process.
        self.COMMON_DATASET_PATH = []
        self.CUSTOM_DATASET_PATH = []

        if oss_url and endpoint:
            # Imported lazily so that oss2 is only required when uploading.
            import oss2
            from oss2.credentials import EnvironmentVariableCredentialsProvider

            auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
            url_parts = oss_url.replace('oss://', '').split('/')
            bucket_name = url_parts[0]

            self.object_path = '/'.join(url_parts[1:])
            self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
        else:
            self.object_path = ''
            self.bucket = None

    def append_path(self, report_path: str, dataset_name: str):
        """Register a report file; ``general_qa`` reports are tracked as custom."""
        if dataset_name == 'general_qa':
            self.CUSTOM_DATASET_PATH.append(report_path)
        else:
            self.COMMON_DATASET_PATH.append(report_path)

    def dump_reports(self, output_dir: str):
        """Merge all registered reports into ``<output_dir>/metric.json``.

        Common reports are stored as loaded. Custom reports additionally get
        a ``name`` key holding the base name of the file they came from. If a
        bucket was configured, the merged file is uploaded afterwards.

        Args:
            output_dir: Directory for ``metric.json``; created if missing.
        """
        result = {'CommonDataset': [], 'CustomDataset': []}
        for line in self.COMMON_DATASET_PATH:
            with open(line, 'r') as f:
                report = json.load(f)
            result['CommonDataset'].append(report)
        for line in self.CUSTOM_DATASET_PATH:
            with open(line, 'r') as f:
                report = json.load(f)
            report.update({'name': os.path.basename(line)})
            result['CustomDataset'].append(report)

        os.makedirs(output_dir, exist_ok=True)
        output_file_name = 'metric.json'
        output_path = os.path.join(output_dir, output_file_name)
        # ``ensure_ascii=False`` emits raw non-ASCII characters, so the file
        # must be written as UTF-8 rather than in the platform's locale encoding.
        with open(output_path, 'w+', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False, indent=4))

        if self.bucket:
            # Object keys always use '/', whatever the local path separator is.
            import posixpath

            remote_path = posixpath.join(self.object_path, output_file_name)
            logger.info(f'** Upload report to oss: {remote_path}')
            self.bucket.put_object_from_file(remote_path, output_path)
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
if __name__ == '__main__':
    # Manual demo: print a comparison table for two placeholder report folders.
    demo_report_dirs = [
        '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports',
        '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports',
    ]
    print(gen_table(demo_report_dirs))

    # ALL VALUES ONLY FOR EXAMPLE
    # +--------------------------+-------------------+-------------+
    # | Model                    | CompetitionMath   | GSM8K       |
    # +==========================+===================+=============+
    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
    # +--------------------------+-------------------+-------------+
    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
    # +--------------------------+-------------------+-------------+
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
3
|
-
# Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
|
|
4
|
-
|
|
5
|
-
# Maps each subject identifier to a list of subcategory labels. Every list
# here holds exactly one label, and main() reads only the first element.
# Entry order is preserved in the mapping that main() prints.
subcategories = {
    'abstract_algebra': ['math'],
    'anatomy': ['health'],
    'astronomy': ['physics'],
    'business_ethics': ['business'],
    'clinical_knowledge': ['health'],
    'college_biology': ['biology'],
    'college_chemistry': ['chemistry'],
    'college_computer_science': ['computer science'],
    'college_mathematics': ['math'],
    'college_medicine': ['health'],
    'college_physics': ['physics'],
    'computer_security': ['computer science'],
    'conceptual_physics': ['physics'],
    'econometrics': ['economics'],
    'electrical_engineering': ['engineering'],
    'elementary_mathematics': ['math'],
    'formal_logic': ['philosophy'],
    'global_facts': ['other'],
    'high_school_biology': ['biology'],
    'high_school_chemistry': ['chemistry'],
    'high_school_computer_science': ['computer science'],
    'high_school_european_history': ['history'],
    'high_school_geography': ['geography'],
    'high_school_government_and_politics': ['politics'],
    'high_school_macroeconomics': ['economics'],
    'high_school_mathematics': ['math'],
    'high_school_microeconomics': ['economics'],
    'high_school_physics': ['physics'],
    'high_school_psychology': ['psychology'],
    'high_school_statistics': ['math'],
    'high_school_us_history': ['history'],
    'high_school_world_history': ['history'],
    'human_aging': ['health'],
    'human_sexuality': ['culture'],
    'international_law': ['law'],
    'jurisprudence': ['law'],
    'logical_fallacies': ['philosophy'],
    'machine_learning': ['computer science'],
    'management': ['business'],
    'marketing': ['business'],
    'medical_genetics': ['health'],
    'miscellaneous': ['other'],
    'moral_disputes': ['philosophy'],
    'moral_scenarios': ['philosophy'],
    'nutrition': ['health'],
    'philosophy': ['philosophy'],
    'prehistory': ['history'],
    'professional_accounting': ['other'],
    'professional_law': ['law'],
    'professional_medicine': ['health'],
    'professional_psychology': ['psychology'],
    'public_relations': ['politics'],
    'security_studies': ['politics'],
    'sociology': ['culture'],
    'us_foreign_policy': ['politics'],
    'virology': ['health'],
    'world_religions': ['philosophy'],
}
|
|
64
|
-
|
|
65
|
-
# Maps each top-level category name to the subcategory labels it contains.
# main() inverts this table to find the category of a given subcategory.
categories = {
    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
    'Humanities': ['history', 'philosophy', 'law'],
    'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
    'Other': ['other', 'business', 'health'],
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def main():
    """Print a dict mapping each subject to [display name, subcategory, category].

    The display name is the subject identifier with underscores replaced by
    spaces and every word capitalized. The subcategory is the first entry of
    the subject's list in ``subcategories``, and the category is whichever
    key of ``categories`` lists that subcategory.
    """
    # Invert ``categories``: subcategory label -> owning category name.
    category_of = {
        subcategory: category
        for category, members in categories.items()
        for subcategory in members
    }

    subject_mapping = {}
    for subject, subcategory_list in subcategories.items():
        primary = subcategory_list[0]
        category_name: str = category_of[primary]
        subject_show_name: str = ' '.join(word.capitalize() for word in subject.split('_'))
        subject_mapping[subject] = [subject_show_name, primary, category_name]

    print(subject_mapping)


if __name__ == '__main__':
    main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|