evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (69) hide show
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gpqa/__init__.py +0 -0
  11. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  12. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  13. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  14. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  15. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  16. evalscope/benchmarks/ifeval/__init__.py +0 -0
  17. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  18. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  19. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  20. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  21. evalscope/benchmarks/ifeval/utils.py +134 -0
  22. evalscope/benchmarks/iquiz/__init__.py +0 -0
  23. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  26. evalscope/benchmarks/race/race_adapter.py +4 -73
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  29. evalscope/cli/cli.py +2 -0
  30. evalscope/cli/start_app.py +30 -0
  31. evalscope/collections/evaluator.py +82 -62
  32. evalscope/collections/sampler.py +47 -41
  33. evalscope/collections/schema.py +14 -10
  34. evalscope/constants.py +4 -0
  35. evalscope/evaluator/evaluator.py +22 -13
  36. evalscope/metrics/__init__.py +2 -5
  37. evalscope/metrics/metrics.py +11 -2
  38. evalscope/metrics/named_metrics.py +17 -0
  39. evalscope/models/chat_adapter.py +2 -0
  40. evalscope/models/server_adapter.py +11 -4
  41. evalscope/perf/__init__.py +1 -0
  42. evalscope/perf/main.py +0 -1
  43. evalscope/perf/plugin/api/custom_api.py +1 -1
  44. evalscope/perf/plugin/api/openai_api.py +1 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  46. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  47. evalscope/report/__init__.py +5 -0
  48. evalscope/report/app.py +693 -0
  49. evalscope/report/combinator.py +73 -0
  50. evalscope/report/generator.py +80 -0
  51. evalscope/report/utils.py +133 -0
  52. evalscope/run.py +16 -11
  53. evalscope/summarizer.py +1 -1
  54. evalscope/utils/chat_service.py +1 -1
  55. evalscope/utils/logger.py +1 -0
  56. evalscope/utils/model_utils.py +5 -2
  57. evalscope/version.py +2 -2
  58. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
  59. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
  60. tests/cli/test_collection.py +11 -7
  61. tests/cli/test_run.py +13 -4
  62. evalscope/tools/__init__.py +0 -1
  63. evalscope/tools/combine_reports.py +0 -133
  64. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  65. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  66. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  67. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  68. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  69. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=v0oKhnJ-2RUpEWWKC_-e7Km5osgPJeZC_aKw8R-3Y0A,4382
2
+ evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
3
3
  evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
4
- evalscope/constants.py,sha256=SAa5IEjcDvcH_ePvCcbValAEyMvGnXPdO0jDmKk8uUs,3277
5
- evalscope/run.py,sha256=cFUwfsXDTQ8NGJYe314LDF_hnuM60UUQxzgbOcPRDbY,5619
4
+ evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
5
+ evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
- evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
8
- evalscope/version.py,sha256=zr0PUDVLPIYwSv10FsTbYbOSBc6BNKFH3cDqhMMp1Jg,118
7
+ evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
+ evalscope/version.py,sha256=Bt6Ke7m38AQOnf3xTgdKX-eFqm09Gu5GYEjTkjPrPEk,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,13 +56,13 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=RuQEH5cQv4I9B1XxBZ0vAKTAfYZSUS9eK0o0RrMFVMA,2407
60
- evalscope/benchmarks/data_adapter.py,sha256=-5Z_fdTRmkcXf1wnRuHgPrGVMKIl8Sq8RBTF9_HYo9A,12146
59
+ evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
60
+ evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
61
61
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
62
62
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
63
- evalscope/benchmarks/arc/arc_adapter.py,sha256=8xw01LNkx19J4BNN-D2SbzcA6GA_9nAVMH7WNPzBWXs,6661
63
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
64
64
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=vpFy-05ubDwJ1IIsIV802_fWicgPJvq3uXtIneVhr48,8293
65
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
66
66
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
67
67
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
68
68
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -91,63 +91,76 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
91
91
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
92
92
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
93
93
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
94
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=-qrzeXWC3dmF-mpJV-Gtz5PDIzCbWaLGdi5x1ha1ZC4,14347
94
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
95
95
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
96
96
  evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
97
97
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
98
98
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
99
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=G1EnVVeYhycQ58a8PiXfYb3Pe4iEuf8ngHNJ4CUJz14,13311
99
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
100
100
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
101
101
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
102
102
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
103
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=xAH3_EiJNhHO1iGTNC7CqTVOF-tpr-9o6Hj_DF5-gNg,6766
103
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
104
104
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=F33qTr2LksJOkkR8VqFM4dwM1CKHSsdWfNrZ7w09z2Y,5650
105
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
106
+ evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
+ evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
108
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=Z5TtgPCCT8AVmFCMVIVmfhqe51CyCTaLSYTiev7smPw,4232
106
109
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
107
110
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
108
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=Qo-4fKHMFzSH5TEkc8NbciKOfP9ESY8CcGRV7dgjh7k,11212
111
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
109
112
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
110
113
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
111
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=Ea_LTREFtroil7D6EGxPT9-QxVGdot5ZhfixUqjuYqo,6046
114
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
112
115
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
113
116
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
114
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=iGxgOMVJTDAmJMmSzCmErLOwTMpPd11afoF5YgtvMJs,5224
117
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
118
+ evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=Jx04TddVZE1gk4wXyljhtt3CLo-7Ux_RcLLMlTV-Nhg,2024
120
+ evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
121
+ evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
122
+ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
123
+ evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
124
+ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
125
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
115
126
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
127
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
117
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8hfAcTXN4c6I45GA8IhU1bJmQMTGJBXoEyaZEuR-ays,14761
128
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
118
129
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
119
130
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=CYDfZTqn6qVwTE66PUpSt-RRqZHwXNZdykQr2QSECSY,4388
131
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
121
132
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
122
133
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
123
- evalscope/benchmarks/race/race_adapter.py,sha256=1tLSb9nCvqCQ_6JjwiknFPD-L1E5pgvOBwZ-11G0JMU,9220
134
+ evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
124
135
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
125
136
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
126
137
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
127
138
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
128
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=pS8-uqNBqRIxTER8oVrLvu8kGJ9L3pvNCqCHZHiCPAc,5191
139
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
129
140
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
130
141
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
131
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=UpzhcW7yCMv4GDzDKqL_y0KxeDkvbupuzoRh5qCsiys,14623
142
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
132
143
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
133
144
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
134
- evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
145
+ evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
146
+ evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
135
147
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
136
148
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
137
149
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
138
150
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
139
- evalscope/collections/evaluator.py,sha256=6bF7TtgHMWOSpuBzpuu9A40y9dNTxdI8vizC5-3LRhI,7404
140
- evalscope/collections/sampler.py,sha256=psvciGq9lE_-EnJxR3l06SM7NC9XmDnRdu1ckH79kXI,4526
141
- evalscope/collections/schema.py,sha256=Eq64Hr8GebsBsO_THixfrIWCioVCpr3LXsGXMaehui0,4055
151
+ evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
152
+ evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
153
+ evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
142
154
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
143
- evalscope/evaluator/evaluator.py,sha256=S3VWI6kFX4cJdsI1Px0-P1y4wmC_PoOqXMFeM3v-C74,16310
155
+ evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
144
156
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
145
157
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
146
158
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
147
- evalscope/metrics/__init__.py,sha256=CnhvODaILc4X0dnBoSPuSbTE2WbSf5NEEzM2M9a6uII,434
159
+ evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
148
160
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
149
161
  evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
150
- evalscope/metrics/metrics.py,sha256=XutNgiBAWACPZEIBSzylugDGFV4fDvo-qIYkxG7w2Mc,12634
162
+ evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
163
+ evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
151
164
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
152
165
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
153
166
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
@@ -155,32 +168,33 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
155
168
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
156
169
  evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
157
170
  evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
158
- evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
171
+ evalscope/models/chat_adapter.py,sha256=9DIMwacjrR647pYVKgeYn090ZKBVHmMD_mf3Gz2vdw0,5461
159
172
  evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
160
173
  evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
161
174
  evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
162
175
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
163
- evalscope/models/server_adapter.py,sha256=InS4M_LprbBV4xHcbPCm5y_S8-kApKDYhR-HEKXzG8Q,4169
176
+ evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
164
177
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
165
178
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
166
- evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
+ evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
180
+ evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
167
181
  evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
168
182
  evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
169
183
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
170
- evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
184
+ evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
171
185
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
172
186
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
173
187
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
174
188
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
175
- evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
189
+ evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
176
190
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
177
- evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
191
+ evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
178
192
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
179
193
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
180
194
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
181
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=CGYtmRw71-ycJIObAHm2gmmJl_1MXPJOwmHV-0WS8DY,1581
195
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
182
196
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
183
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=Yx5nxHGkmD4lJOJ-jcyqm2ZsGAxotJc77jUCkO1z0a4,1164
197
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
184
198
  evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
185
199
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
186
200
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -210,6 +224,11 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
210
224
  evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
211
225
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
212
226
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
227
+ evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
228
+ evalscope/report/app.py,sha256=kB4CCrAoIONRc37Np8B3QsLxJBD_j2Sw2xtfR1FgfC0,26087
229
+ evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
230
+ evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
231
+ evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
213
232
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
214
233
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
215
234
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -237,23 +256,19 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
237
256
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
238
257
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
239
258
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
240
- evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
241
- evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
242
- evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
243
- evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
244
259
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
245
260
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
246
- evalscope/utils/chat_service.py,sha256=h6Z9CpgdmalD9u2WNxdfJw2MdzDqsMfDHmnNk8GkffY,8666
261
+ evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
247
262
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
248
263
  evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
249
- evalscope/utils/logger.py,sha256=Cke17sVV9MrccINeuEsiVouJarDvS4Wt2JUaWK5NFLM,3582
250
- evalscope/utils/model_utils.py,sha256=PqIu1nMhoD7sauZATkuxkPo4lrYTQRh8kleERrWD-Po,678
264
+ evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
265
+ evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
251
266
  evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
252
267
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
253
268
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
254
269
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
255
- tests/cli/test_collection.py,sha256=pS-omRGU6yuvk5O5RPRIOklVKWKsV3lvPNvmk7rVIMY,2825
256
- tests/cli/test_run.py,sha256=V5lxiqtuNcpbjewPaE3KD8ssuIolvhhIzYEU7iDXlZE,5492
270
+ tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
271
+ tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
257
272
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
258
273
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
259
274
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -266,9 +281,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
266
281
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
267
282
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
268
283
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
269
- evalscope-0.9.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
270
- evalscope-0.9.0.dist-info/METADATA,sha256=KbU5bo3jjt1FsaTVXvdRqJJQEgge_431xW3uQHYKawI,25136
271
- evalscope-0.9.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
272
- evalscope-0.9.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
273
- evalscope-0.9.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
274
- evalscope-0.9.0.dist-info/RECORD,,
284
+ evalscope-0.10.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
285
+ evalscope-0.10.1.dist-info/METADATA,sha256=-HQt66q9NaZvcNwiXgLW87aduUogXKaHYz6JokxtEXc,28975
286
+ evalscope-0.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
287
+ evalscope-0.10.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
288
+ evalscope-0.10.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
289
+ evalscope-0.10.1.dist-info/RECORD,,
@@ -12,15 +12,19 @@ class TestCollection(unittest.TestCase):
12
12
  def test_create_collection(self):
13
13
  schema = CollectionSchema(name='math&reasoning', datasets=[
14
14
  CollectionSchema(name='math', datasets=[
15
+ CollectionSchema(name='generation', datasets=[
15
16
  DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
16
17
  DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
18
+ ]),
19
+ CollectionSchema(name='multiple_choice', datasets=[
17
20
  DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
18
21
  DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
22
+ ]),
19
23
  ]),
20
24
  CollectionSchema(name='reasoning', datasets=[
21
- DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
22
- DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
23
- DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
25
+ DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
26
+ DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
27
+ DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
24
28
  ]),
25
29
  ])
26
30
  print(schema.to_dict())
@@ -32,7 +36,7 @@ class TestCollection(unittest.TestCase):
32
36
  def test_generate_data(self):
33
37
  schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
34
38
  print(schema.to_dict())
35
- mixed_data = WeightedSampler(schema, 100).sample()
39
+ mixed_data = WeightedSampler(schema).sample(100)
36
40
  dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
37
41
 
38
42
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
@@ -40,14 +44,14 @@ class TestCollection(unittest.TestCase):
40
44
  from evalscope import TaskConfig, run_task
41
45
 
42
46
  task_cfg = TaskConfig(
43
- model='qwen2.5',
47
+ model='Qwen2.5-7B-Instruct',
44
48
  api_url='http://127.0.0.1:8801/v1/chat/completions',
45
49
  api_key='EMPTY',
46
50
  eval_type=EvalType.SERVICE,
47
51
  datasets=['data_collection'],
48
52
  dataset_args={'data_collection': {
49
- # 'local_path': 'outputs/mixed_data_test.jsonl'
50
- 'local_path': 'outputs/weighted_mixed_data.jsonl'
53
+ 'local_path': 'outputs/mixed_data_test.jsonl'
54
+ # 'local_path': 'outputs/weighted_mixed_data.jsonl'
51
55
  }},
52
56
  )
53
57
  run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -128,12 +128,15 @@ class TestRun(unittest.TestCase):
128
128
  from evalscope.config import TaskConfig
129
129
 
130
130
  task_cfg = TaskConfig(
131
- model='qwen2.5',
131
+ model='Qwen2.5-7B-Instruct',
132
132
  api_url='http://127.0.0.1:8801/v1/chat/completions',
133
133
  api_key='EMPTY',
134
134
  eval_type=EvalType.SERVICE,
135
135
  datasets=[
136
- 'mmlu_pro',
136
+ 'iquiz',
137
+ # 'ifeval',
138
+ # 'mmlu',
139
+ # 'mmlu_pro',
137
140
  # 'race',
138
141
  # 'trivia_qa',
139
142
  # 'cmmlu',
@@ -145,8 +148,14 @@ class TestRun(unittest.TestCase):
145
148
  # 'bbh',
146
149
  # 'hellaswag',
147
150
  ],
148
- limit=2,
149
- debug=True
151
+ dataset_args={
152
+ 'ceval': {
153
+ 'subset_list': [
154
+ 'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
155
+ ]
156
+ }
157
+ },
158
+ # limit=10
150
159
  )
151
160
 
152
161
  run_task(task_cfg=task_cfg)
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -1,133 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- import glob
4
- import json
5
- import os
6
- from collections import defaultdict
7
- from tabulate import tabulate
8
-
9
- from evalscope.utils.logger import get_logger
10
-
11
- logger = get_logger()
12
- """
13
- Combine and generate table for reports of LLMs.
14
- """
15
-
16
-
17
- def get_report(report_file: str):
18
- data_d: dict = json.load(open(report_file, 'r'))
19
- dataset_name = data_d['dataset_name']
20
- model_name = data_d['model_name']
21
- score = data_d['score'] # float or dict
22
- metric = data_d['metric']
23
- score_d = {}
24
- if isinstance(score, dict):
25
- score_d = score
26
- elif isinstance(score, float):
27
- score_d[metric] = score
28
- else:
29
- raise ValueError(f'Unknown score type: {type(score)}')
30
- score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
31
-
32
- return model_name, {'dataset_name': dataset_name, 'score': score_str}
33
-
34
-
35
- def get_model_reports(model_report_dir: str):
36
- model_report_dir = os.path.normpath(model_report_dir)
37
- report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
38
-
39
- model_reports_d = defaultdict(list)
40
- for file_path in report_files:
41
- model_name, report_d = get_report(file_path)
42
- model_reports_d[model_name].append(report_d)
43
-
44
- return model_reports_d
45
-
46
-
47
- def gen_table(reports_path_list: list):
48
- table_values = []
49
- headers = ['Model']
50
- is_headers_set = False
51
-
52
- for report_path in reports_path_list:
53
- model_reports_d = get_model_reports(report_path)
54
- for model_name, report_list in model_reports_d.items():
55
- report_list = sorted(report_list, key=lambda x: x['dataset_name'])
56
- if not is_headers_set:
57
- headers.extend([x['dataset_name'] for x in report_list])
58
- is_headers_set = True
59
- single_row = []
60
- single_row.append(model_name)
61
- for single_report in report_list:
62
- # e.g. '28.51 (acc)'
63
- single_row.append(single_report['score'])
64
- table_values.append(single_row)
65
-
66
- report_table = tabulate(table_values, headers=headers, tablefmt='grid')
67
- return report_table
68
-
69
-
70
- class ReportsRecorder:
71
- COMMON_DATASET_PATH = []
72
- CUSTOM_DATASET_PATH = []
73
-
74
- def __init__(self, oss_url: str = '', endpoint: str = ''):
75
- if oss_url and endpoint:
76
- import oss2
77
- from oss2.credentials import EnvironmentVariableCredentialsProvider
78
-
79
- auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
80
- oss_url = oss_url.replace('oss://', '').split('/')
81
- bucket_name = oss_url[0]
82
-
83
- self.object_path = '/'.join(oss_url[1:])
84
- self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
85
- else:
86
- self.object_path = ''
87
- self.bucket = None
88
-
89
- def append_path(self, report_path: str, dataset_name: str):
90
- if dataset_name == 'general_qa':
91
- self.CUSTOM_DATASET_PATH.append(report_path)
92
- else:
93
- self.COMMON_DATASET_PATH.append(report_path)
94
-
95
- def dump_reports(self, output_dir: str):
96
- result = {'CommonDataset': [], 'CustomDataset': []}
97
- for line in self.COMMON_DATASET_PATH:
98
- with open(line, 'r') as f:
99
- report = json.load(f)
100
- result['CommonDataset'].append(report)
101
- for line in self.CUSTOM_DATASET_PATH:
102
- with open(line, 'r') as f:
103
- report = json.load(f)
104
- report.update({'name': os.path.basename(line)})
105
- result['CustomDataset'].append(report)
106
-
107
- os.makedirs(output_dir, exist_ok=True)
108
- output_file_name = 'metric.json'
109
- output_path = os.path.join(output_dir, output_file_name)
110
- with open(output_path, 'w+') as f:
111
- f.write(json.dumps(result, ensure_ascii=False, indent=4))
112
-
113
- if self.bucket:
114
- remote_path = os.path.join(self.object_path, output_file_name)
115
- logger.info(f'** Upload report to oss: {remote_path}')
116
- self.bucket.put_object_from_file(remote_path, output_path)
117
-
118
-
119
- if __name__ == '__main__':
120
- report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
121
- report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
122
-
123
- report_table = gen_table([report_dir_1, report_dir_2])
124
- print(report_table)
125
-
126
- # ALL VALUES ONLY FOR EXAMPLE
127
- # +--------------------------+-------------------+-------------+
128
- # | Model | CompetitionMath | GSM8K |
129
- # +==========================+===================+=============+
130
- # | ZhipuAI_chatglm2-6b-base | 25.0 (acc) | 30.50 (acc) |
131
- # +--------------------------+-------------------+-------------+
132
- # | ZhipuAI_chatglm2-6b | 30.5 (acc) | 40.50 (acc) |
133
- # +--------------------------+-------------------+-------------+
@@ -1,90 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- # Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
4
-
5
- subcategories = {
6
- 'abstract_algebra': ['math'],
7
- 'anatomy': ['health'],
8
- 'astronomy': ['physics'],
9
- 'business_ethics': ['business'],
10
- 'clinical_knowledge': ['health'],
11
- 'college_biology': ['biology'],
12
- 'college_chemistry': ['chemistry'],
13
- 'college_computer_science': ['computer science'],
14
- 'college_mathematics': ['math'],
15
- 'college_medicine': ['health'],
16
- 'college_physics': ['physics'],
17
- 'computer_security': ['computer science'],
18
- 'conceptual_physics': ['physics'],
19
- 'econometrics': ['economics'],
20
- 'electrical_engineering': ['engineering'],
21
- 'elementary_mathematics': ['math'],
22
- 'formal_logic': ['philosophy'],
23
- 'global_facts': ['other'],
24
- 'high_school_biology': ['biology'],
25
- 'high_school_chemistry': ['chemistry'],
26
- 'high_school_computer_science': ['computer science'],
27
- 'high_school_european_history': ['history'],
28
- 'high_school_geography': ['geography'],
29
- 'high_school_government_and_politics': ['politics'],
30
- 'high_school_macroeconomics': ['economics'],
31
- 'high_school_mathematics': ['math'],
32
- 'high_school_microeconomics': ['economics'],
33
- 'high_school_physics': ['physics'],
34
- 'high_school_psychology': ['psychology'],
35
- 'high_school_statistics': ['math'],
36
- 'high_school_us_history': ['history'],
37
- 'high_school_world_history': ['history'],
38
- 'human_aging': ['health'],
39
- 'human_sexuality': ['culture'],
40
- 'international_law': ['law'],
41
- 'jurisprudence': ['law'],
42
- 'logical_fallacies': ['philosophy'],
43
- 'machine_learning': ['computer science'],
44
- 'management': ['business'],
45
- 'marketing': ['business'],
46
- 'medical_genetics': ['health'],
47
- 'miscellaneous': ['other'],
48
- 'moral_disputes': ['philosophy'],
49
- 'moral_scenarios': ['philosophy'],
50
- 'nutrition': ['health'],
51
- 'philosophy': ['philosophy'],
52
- 'prehistory': ['history'],
53
- 'professional_accounting': ['other'],
54
- 'professional_law': ['law'],
55
- 'professional_medicine': ['health'],
56
- 'professional_psychology': ['psychology'],
57
- 'public_relations': ['politics'],
58
- 'security_studies': ['politics'],
59
- 'sociology': ['culture'],
60
- 'us_foreign_policy': ['politics'],
61
- 'virology': ['health'],
62
- 'world_religions': ['philosophy'],
63
- }
64
-
65
- categories = {
66
- 'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
67
- 'Humanities': ['history', 'philosophy', 'law'],
68
- 'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
69
- 'Other': ['other', 'business', 'health'],
70
- }
71
-
72
-
73
- def main():
74
-
75
- reversed_categories = {}
76
- for category, subcategory_list in categories.items():
77
- for subcategory in subcategory_list:
78
- reversed_categories[subcategory] = category
79
-
80
- subject_mapping = {}
81
- for subject, subcategory_list in subcategories.items():
82
- category_name: str = reversed_categories[subcategory_list[0]]
83
- subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
84
- subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
85
-
86
- print(subject_mapping)
87
-
88
-
89
- if __name__ == '__main__':
90
- main()