evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -109
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
evalscope/__init__.py,sha256=
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
6
|
-
evalscope/run_arena.py,sha256=
|
|
1
|
+
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
+
evalscope/arguments.py,sha256=v0oKhnJ-2RUpEWWKC_-e7Km5osgPJeZC_aKw8R-3Y0A,4382
|
|
3
|
+
evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
|
|
4
|
+
evalscope/constants.py,sha256=SAa5IEjcDvcH_ePvCcbValAEyMvGnXPdO0jDmKk8uUs,3277
|
|
5
|
+
evalscope/run.py,sha256=cFUwfsXDTQ8NGJYe314LDF_hnuM60UUQxzgbOcPRDbY,5619
|
|
6
|
+
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=zr0PUDVLPIYwSv10FsTbYbOSBc6BNKFH3cDqhMMp1Jg,118
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y
|
|
|
22
22
|
evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
|
|
23
23
|
evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
|
|
25
|
-
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=
|
|
25
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=NwpxNECN7NFgtlVdKY7vet5m-gAmIp8MJYka0eexWu0,7424
|
|
26
26
|
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
|
|
27
27
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
|
|
28
28
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
|
|
@@ -50,19 +50,19 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
|
|
|
50
50
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
51
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
52
|
evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
|
|
53
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
53
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
|
|
54
54
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
55
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
|
-
evalscope/benchmarks/__init__.py,sha256=
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256
|
|
61
|
-
evalscope/benchmarks/arc/__init__.py,sha256=
|
|
58
|
+
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=RuQEH5cQv4I9B1XxBZ0vAKTAfYZSUS9eK0o0RrMFVMA,2407
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=-5Z_fdTRmkcXf1wnRuHgPrGVMKIl8Sq8RBTF9_HYo9A,12146
|
|
61
|
+
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
62
62
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
63
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
64
|
-
evalscope/benchmarks/bbh/__init__.py,sha256=
|
|
65
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
63
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=8xw01LNkx19J4BNN-D2SbzcA6GA_9nAVMH7WNPzBWXs,6661
|
|
64
|
+
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=vpFy-05ubDwJ1IIsIV802_fWicgPJvq3uXtIneVhr48,8293
|
|
66
66
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
67
67
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
68
68
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -90,70 +90,77 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
|
|
|
90
90
|
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
91
91
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
92
92
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
93
|
-
evalscope/benchmarks/ceval/__init__.py,sha256=
|
|
94
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256
|
|
93
|
+
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
94
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=-qrzeXWC3dmF-mpJV-Gtz5PDIzCbWaLGdi5x1ha1ZC4,14347
|
|
95
95
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
96
96
|
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
97
|
-
evalscope/benchmarks/cmmlu/__init__.py,sha256=
|
|
97
|
+
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
98
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
99
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
99
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=G1EnVVeYhycQ58a8PiXfYb3Pe4iEuf8ngHNJ4CUJz14,13311
|
|
100
100
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
101
|
-
evalscope/benchmarks/competition_math/__init__.py,sha256=
|
|
101
|
+
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
102
102
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
103
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
104
|
-
evalscope/benchmarks/general_qa/__init__.py,sha256=
|
|
105
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
106
|
-
evalscope/benchmarks/gsm8k/__init__.py,sha256=
|
|
103
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=xAH3_EiJNhHO1iGTNC7CqTVOF-tpr-9o6Hj_DF5-gNg,6766
|
|
104
|
+
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=F33qTr2LksJOkkR8VqFM4dwM1CKHSsdWfNrZ7w09z2Y,5650
|
|
106
|
+
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
107
107
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
108
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
109
|
-
evalscope/benchmarks/hellaswag/__init__.py,sha256=
|
|
108
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=Qo-4fKHMFzSH5TEkc8NbciKOfP9ESY8CcGRV7dgjh7k,11212
|
|
109
|
+
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
110
110
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
111
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
112
|
-
evalscope/benchmarks/humaneval/__init__.py,sha256=
|
|
111
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=Ea_LTREFtroil7D6EGxPT9-QxVGdot5ZhfixUqjuYqo,6046
|
|
112
|
+
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
113
113
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
114
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
115
|
-
evalscope/benchmarks/mmlu/__init__.py,sha256=
|
|
114
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=iGxgOMVJTDAmJMmSzCmErLOwTMpPd11afoF5YgtvMJs,5224
|
|
115
|
+
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
116
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
117
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
117
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8hfAcTXN4c6I45GA8IhU1bJmQMTGJBXoEyaZEuR-ays,14761
|
|
118
118
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
119
|
-
evalscope/benchmarks/
|
|
119
|
+
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
120
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=CYDfZTqn6qVwTE66PUpSt-RRqZHwXNZdykQr2QSECSY,4388
|
|
121
|
+
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
120
122
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
121
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
123
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=1tLSb9nCvqCQ_6JjwiknFPD-L1E5pgvOBwZ-11G0JMU,9220
|
|
122
124
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
123
|
-
evalscope/benchmarks/trivia_qa/__init__.py,sha256=
|
|
125
|
+
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
124
126
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
125
127
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
126
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
127
|
-
evalscope/benchmarks/truthful_qa/__init__.py,sha256=
|
|
128
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=pS8-uqNBqRIxTER8oVrLvu8kGJ9L3pvNCqCHZHiCPAc,5191
|
|
129
|
+
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
128
130
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
129
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
131
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=UpzhcW7yCMv4GDzDKqL_y0KxeDkvbupuzoRh5qCsiys,14623
|
|
130
132
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
131
133
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
132
134
|
evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
|
|
133
135
|
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
134
136
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
135
137
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
138
|
+
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
139
|
+
evalscope/collections/evaluator.py,sha256=6bF7TtgHMWOSpuBzpuu9A40y9dNTxdI8vizC5-3LRhI,7404
|
|
140
|
+
evalscope/collections/sampler.py,sha256=psvciGq9lE_-EnJxR3l06SM7NC9XmDnRdu1ckH79kXI,4526
|
|
141
|
+
evalscope/collections/schema.py,sha256=Eq64Hr8GebsBsO_THixfrIWCioVCpr3LXsGXMaehui0,4055
|
|
136
142
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
137
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
143
|
+
evalscope/evaluator/evaluator.py,sha256=S3VWI6kFX4cJdsI1Px0-P1y4wmC_PoOqXMFeM3v-C74,16310
|
|
138
144
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
139
145
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
140
|
-
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=
|
|
141
|
-
evalscope/metrics/__init__.py,sha256=
|
|
146
|
+
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
147
|
+
evalscope/metrics/__init__.py,sha256=CnhvODaILc4X0dnBoSPuSbTE2WbSf5NEEzM2M9a6uII,434
|
|
142
148
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
143
|
-
evalscope/metrics/math_accuracy.py,sha256=
|
|
144
|
-
evalscope/metrics/metrics.py,sha256=
|
|
145
|
-
evalscope/metrics/rouge_metric.py,sha256=
|
|
149
|
+
evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
|
|
150
|
+
evalscope/metrics/metrics.py,sha256=XutNgiBAWACPZEIBSzylugDGFV4fDvo-qIYkxG7w2Mc,12634
|
|
151
|
+
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
146
152
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
147
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
153
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
148
154
|
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
149
155
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
150
|
-
evalscope/models/__init__.py,sha256=
|
|
151
|
-
evalscope/models/
|
|
152
|
-
evalscope/models/
|
|
153
|
-
evalscope/models/
|
|
154
|
-
evalscope/models/
|
|
155
|
-
evalscope/models/
|
|
156
|
-
evalscope/models/
|
|
156
|
+
evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
|
|
157
|
+
evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
|
|
158
|
+
evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
|
|
159
|
+
evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
|
|
160
|
+
evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
|
|
161
|
+
evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
|
|
162
|
+
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
163
|
+
evalscope/models/server_adapter.py,sha256=InS4M_LprbBV4xHcbPCm5y_S8-kApKDYhR-HEKXzG8Q,4169
|
|
157
164
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
158
165
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
159
166
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -219,6 +226,7 @@ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=
|
|
|
219
226
|
evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
|
|
220
227
|
evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
221
228
|
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
229
|
+
evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
222
230
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
223
231
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
224
232
|
evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
|
|
@@ -233,23 +241,24 @@ evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,5
|
|
|
233
241
|
evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
|
|
234
242
|
evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
|
|
235
243
|
evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
236
|
-
evalscope/utils/__init__.py,sha256=
|
|
244
|
+
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
237
245
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
238
|
-
evalscope/utils/chat_service.py,sha256=
|
|
246
|
+
evalscope/utils/chat_service.py,sha256=h6Z9CpgdmalD9u2WNxdfJw2MdzDqsMfDHmnNk8GkffY,8666
|
|
239
247
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
240
|
-
evalscope/utils/io_utils.py,sha256=
|
|
241
|
-
evalscope/utils/logger.py,sha256=
|
|
242
|
-
evalscope/utils/model_utils.py,sha256=
|
|
243
|
-
evalscope/utils/utils.py,sha256=
|
|
248
|
+
evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
|
|
249
|
+
evalscope/utils/logger.py,sha256=Cke17sVV9MrccINeuEsiVouJarDvS4Wt2JUaWK5NFLM,3582
|
|
250
|
+
evalscope/utils/model_utils.py,sha256=PqIu1nMhoD7sauZATkuxkPo4lrYTQRh8kleERrWD-Po,678
|
|
251
|
+
evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
|
|
244
252
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
245
253
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
246
254
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
247
|
-
tests/cli/
|
|
255
|
+
tests/cli/test_collection.py,sha256=pS-omRGU6yuvk5O5RPRIOklVKWKsV3lvPNvmk7rVIMY,2825
|
|
256
|
+
tests/cli/test_run.py,sha256=V5lxiqtuNcpbjewPaE3KD8ssuIolvhhIzYEU7iDXlZE,5492
|
|
248
257
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
249
258
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
250
259
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
251
260
|
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
252
|
-
tests/rag/test_mteb.py,sha256=
|
|
261
|
+
tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
|
|
253
262
|
tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
|
|
254
263
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
255
264
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
@@ -257,9 +266,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
257
266
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
258
267
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
259
268
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
260
|
-
evalscope-0.
|
|
261
|
-
evalscope-0.
|
|
262
|
-
evalscope-0.
|
|
263
|
-
evalscope-0.
|
|
264
|
-
evalscope-0.
|
|
265
|
-
evalscope-0.
|
|
269
|
+
evalscope-0.9.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
270
|
+
evalscope-0.9.0.dist-info/METADATA,sha256=KbU5bo3jjt1FsaTVXvdRqJJQEgge_431xW3uQHYKawI,25136
|
|
271
|
+
evalscope-0.9.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
272
|
+
evalscope-0.9.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
273
|
+
evalscope-0.9.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
274
|
+
evalscope-0.9.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import unittest
|
|
3
|
+
|
|
4
|
+
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
5
|
+
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.utils.io_utils import dump_jsonl_data
|
|
7
|
+
from evalscope.utils.utils import test_level_list
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestCollection(unittest.TestCase):
|
|
11
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
12
|
+
def test_create_collection(self):
|
|
13
|
+
schema = CollectionSchema(name='math&reasoning', datasets=[
|
|
14
|
+
CollectionSchema(name='math', datasets=[
|
|
15
|
+
DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
|
|
16
|
+
DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
|
|
17
|
+
DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
|
|
18
|
+
DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
|
|
19
|
+
]),
|
|
20
|
+
CollectionSchema(name='reasoning', datasets=[
|
|
21
|
+
DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
22
|
+
DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
|
|
23
|
+
DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
24
|
+
]),
|
|
25
|
+
])
|
|
26
|
+
print(schema.to_dict())
|
|
27
|
+
print(schema.flatten())
|
|
28
|
+
schema.dump_json('outputs/schema_test.json')
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
32
|
+
def test_generate_data(self):
|
|
33
|
+
schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
|
|
34
|
+
print(schema.to_dict())
|
|
35
|
+
mixed_data = WeightedSampler(schema, 100).sample()
|
|
36
|
+
dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
|
|
37
|
+
|
|
38
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
39
|
+
def test_evaluate_collection(self):
|
|
40
|
+
from evalscope import TaskConfig, run_task
|
|
41
|
+
|
|
42
|
+
task_cfg = TaskConfig(
|
|
43
|
+
model='qwen2.5',
|
|
44
|
+
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
45
|
+
api_key='EMPTY',
|
|
46
|
+
eval_type=EvalType.SERVICE,
|
|
47
|
+
datasets=['data_collection'],
|
|
48
|
+
dataset_args={'data_collection': {
|
|
49
|
+
# 'local_path': 'outputs/mixed_data_test.jsonl'
|
|
50
|
+
'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
51
|
+
}},
|
|
52
|
+
)
|
|
53
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_run.py
CHANGED
|
@@ -4,6 +4,7 @@ import subprocess
|
|
|
4
4
|
import torch
|
|
5
5
|
import unittest
|
|
6
6
|
|
|
7
|
+
from evalscope.constants import EvalType
|
|
7
8
|
from evalscope.run import run_task
|
|
8
9
|
from evalscope.utils import is_module_installed, test_level_list
|
|
9
10
|
from evalscope.utils.logger import get_logger
|
|
@@ -70,7 +71,19 @@ class TestRun(unittest.TestCase):
|
|
|
70
71
|
|
|
71
72
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
72
73
|
def test_run_task(self):
|
|
73
|
-
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
74
|
+
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
75
|
+
'datasets': [
|
|
76
|
+
'mmlu_pro',
|
|
77
|
+
# 'bbh',
|
|
78
|
+
'hellaswag',
|
|
79
|
+
# 'gsm8k',
|
|
80
|
+
# 'arc'
|
|
81
|
+
# 'race',
|
|
82
|
+
# 'truthful_qa',
|
|
83
|
+
# 'trivia_qa',
|
|
84
|
+
],
|
|
85
|
+
'limit': 20,
|
|
86
|
+
'debug': True}
|
|
74
87
|
run_task(task_cfg=task_cfg)
|
|
75
88
|
|
|
76
89
|
|
|
@@ -110,5 +123,34 @@ class TestRun(unittest.TestCase):
|
|
|
110
123
|
|
|
111
124
|
run_task(task_cfg=task_cfg)
|
|
112
125
|
|
|
126
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
127
|
+
def test_run_server_model(self):
|
|
128
|
+
from evalscope.config import TaskConfig
|
|
129
|
+
|
|
130
|
+
task_cfg = TaskConfig(
|
|
131
|
+
model='qwen2.5',
|
|
132
|
+
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
133
|
+
api_key='EMPTY',
|
|
134
|
+
eval_type=EvalType.SERVICE,
|
|
135
|
+
datasets=[
|
|
136
|
+
'mmlu_pro',
|
|
137
|
+
# 'race',
|
|
138
|
+
# 'trivia_qa',
|
|
139
|
+
# 'cmmlu',
|
|
140
|
+
# 'humaneval',
|
|
141
|
+
# 'competition_math',
|
|
142
|
+
# 'gsm8k',
|
|
143
|
+
# 'arc',
|
|
144
|
+
# 'ceval',
|
|
145
|
+
# 'bbh',
|
|
146
|
+
# 'hellaswag',
|
|
147
|
+
],
|
|
148
|
+
limit=2,
|
|
149
|
+
debug=True
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
run_task(task_cfg=task_cfg)
|
|
153
|
+
|
|
154
|
+
|
|
113
155
|
if __name__ == '__main__':
|
|
114
156
|
unittest.main()
|
tests/rag/test_mteb.py
CHANGED
|
@@ -79,7 +79,7 @@ class TestMTEB(unittest.TestCase):
|
|
|
79
79
|
},
|
|
80
80
|
},
|
|
81
81
|
{
|
|
82
|
-
'model_name_or_path': '
|
|
82
|
+
'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
|
|
83
83
|
'is_cross_encoder': True,
|
|
84
84
|
'max_seq_length': 512,
|
|
85
85
|
'prompt': '为这个问题生成一个检索用的表示',
|
|
@@ -94,7 +94,8 @@ class TestMTEB(unittest.TestCase):
|
|
|
94
94
|
'verbosity': 2,
|
|
95
95
|
'output_folder': 'outputs',
|
|
96
96
|
'overwrite_results': True,
|
|
97
|
-
'limits': 10,
|
|
97
|
+
# 'limits': 10,
|
|
98
|
+
'top_k': 10,
|
|
98
99
|
},
|
|
99
100
|
},
|
|
100
101
|
}
|
evalscope/models/api/__init__.py
DELETED
|
evalscope/models/dummy_chat_model.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
3
|
-
import random
|
|
4
|
-
import time
|
|
5
|
-
|
|
6
|
-
from evalscope.models import ChatBaseModel
|
|
7
|
-
from evalscope.utils.logger import get_logger
|
|
8
|
-
|
|
9
|
-
logger = get_logger()
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class DummyChatModel(ChatBaseModel):
|
|
13
|
-
|
|
14
|
-
MODEL_ID = 'dummy_chat_model_0801'
|
|
15
|
-
REVISION = 'v1.0.0'
|
|
16
|
-
|
|
17
|
-
def __init__(self, model_cfg: dict, **kwargs):
|
|
18
|
-
model_cfg['model_id'] = self.MODEL_ID
|
|
19
|
-
model_cfg['revision'] = self.REVISION
|
|
20
|
-
super(DummyChatModel, self).__init__(model_cfg=model_cfg)
|
|
21
|
-
|
|
22
|
-
def predict(self, inputs: dict, **kwargs) -> dict:
|
|
23
|
-
|
|
24
|
-
debug: bool = False
|
|
25
|
-
if debug:
|
|
26
|
-
messages = inputs['messages']
|
|
27
|
-
history = inputs['history']
|
|
28
|
-
|
|
29
|
-
logger.info(f'** messages: {messages}')
|
|
30
|
-
logger.info(f'** history: {history}')
|
|
31
|
-
|
|
32
|
-
choice = random.choice(['A', 'B', 'C', 'D'])
|
|
33
|
-
|
|
34
|
-
# Build response
|
|
35
|
-
res = {
|
|
36
|
-
'choices': [{
|
|
37
|
-
'index': 0,
|
|
38
|
-
'message': {
|
|
39
|
-
'content': choice,
|
|
40
|
-
'role': 'assistant'
|
|
41
|
-
}
|
|
42
|
-
}],
|
|
43
|
-
'created': time.time(),
|
|
44
|
-
'model': self.MODEL_ID + '-' + self.REVISION,
|
|
45
|
-
'object': 'chat.completion',
|
|
46
|
-
'usage': {}
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
return res
|