evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=v0oKhnJ-2RUpEWWKC_-e7Km5osgPJeZC_aKw8R-3Y0A,4382
2
+ evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
3
3
  evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
4
- evalscope/constants.py,sha256=SAa5IEjcDvcH_ePvCcbValAEyMvGnXPdO0jDmKk8uUs,3277
5
- evalscope/run.py,sha256=cFUwfsXDTQ8NGJYe314LDF_hnuM60UUQxzgbOcPRDbY,5619
4
+ evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
5
+ evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
- evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
8
- evalscope/version.py,sha256=zr0PUDVLPIYwSv10FsTbYbOSBc6BNKFH3cDqhMMp1Jg,118
7
+ evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
+ evalscope/version.py,sha256=59oai-Z2lJog2HCNhMbBxRg4D3vkwPK5sfffmDSPntE,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,13 +56,13 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=RuQEH5cQv4I9B1XxBZ0vAKTAfYZSUS9eK0o0RrMFVMA,2407
60
- evalscope/benchmarks/data_adapter.py,sha256=-5Z_fdTRmkcXf1wnRuHgPrGVMKIl8Sq8RBTF9_HYo9A,12146
59
+ evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
60
+ evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
61
61
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
62
62
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
63
- evalscope/benchmarks/arc/arc_adapter.py,sha256=8xw01LNkx19J4BNN-D2SbzcA6GA_9nAVMH7WNPzBWXs,6661
63
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
64
64
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=vpFy-05ubDwJ1IIsIV802_fWicgPJvq3uXtIneVhr48,8293
65
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
66
66
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
67
67
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
68
68
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -91,63 +91,73 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
91
91
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
92
92
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
93
93
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
94
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=-qrzeXWC3dmF-mpJV-Gtz5PDIzCbWaLGdi5x1ha1ZC4,14347
94
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
95
95
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
96
96
  evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
97
97
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
98
98
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
99
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=G1EnVVeYhycQ58a8PiXfYb3Pe4iEuf8ngHNJ4CUJz14,13311
99
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
100
100
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
101
101
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
102
102
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
103
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=xAH3_EiJNhHO1iGTNC7CqTVOF-tpr-9o6Hj_DF5-gNg,6766
103
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
104
104
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=F33qTr2LksJOkkR8VqFM4dwM1CKHSsdWfNrZ7w09z2Y,5650
105
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
106
106
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
107
107
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
108
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=Qo-4fKHMFzSH5TEkc8NbciKOfP9ESY8CcGRV7dgjh7k,11212
108
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
109
109
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
110
110
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
111
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=Ea_LTREFtroil7D6EGxPT9-QxVGdot5ZhfixUqjuYqo,6046
111
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
112
112
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
113
113
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
114
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=iGxgOMVJTDAmJMmSzCmErLOwTMpPd11afoF5YgtvMJs,5224
114
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
115
+ evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=JwJoXfAiawx9Rey1MsEtwCdo7QMl_wxOjspiWAuJFko,2074
117
+ evalscope/benchmarks/ifeval/instructions.py,sha256=8mV4f9H1vE8tEnbF1k8uVoDjzJL2tt7lCu2JQaqJelw,56247
118
+ evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
119
+ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
120
+ evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
121
+ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
115
123
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
124
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
117
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8hfAcTXN4c6I45GA8IhU1bJmQMTGJBXoEyaZEuR-ays,14761
125
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
118
126
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
119
127
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=CYDfZTqn6qVwTE66PUpSt-RRqZHwXNZdykQr2QSECSY,4388
128
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
121
129
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
122
130
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
123
- evalscope/benchmarks/race/race_adapter.py,sha256=1tLSb9nCvqCQ_6JjwiknFPD-L1E5pgvOBwZ-11G0JMU,9220
131
+ evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
124
132
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
125
133
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
126
134
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
127
135
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
128
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=pS8-uqNBqRIxTER8oVrLvu8kGJ9L3pvNCqCHZHiCPAc,5191
136
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
129
137
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
130
138
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
131
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=UpzhcW7yCMv4GDzDKqL_y0KxeDkvbupuzoRh5qCsiys,14623
139
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
132
140
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
133
141
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
134
- evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
142
+ evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
143
+ evalscope/cli/start_app.py,sha256=icLwBq5yHVmJ4C9y-sYq_o_rPvCT-oO-F2r7RlegHv0,706
135
144
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
136
145
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
137
146
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
138
147
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
139
- evalscope/collections/evaluator.py,sha256=6bF7TtgHMWOSpuBzpuu9A40y9dNTxdI8vizC5-3LRhI,7404
140
- evalscope/collections/sampler.py,sha256=psvciGq9lE_-EnJxR3l06SM7NC9XmDnRdu1ckH79kXI,4526
141
- evalscope/collections/schema.py,sha256=Eq64Hr8GebsBsO_THixfrIWCioVCpr3LXsGXMaehui0,4055
148
+ evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
149
+ evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
150
+ evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
142
151
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
143
- evalscope/evaluator/evaluator.py,sha256=S3VWI6kFX4cJdsI1Px0-P1y4wmC_PoOqXMFeM3v-C74,16310
152
+ evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
144
153
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
145
154
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
146
155
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
147
- evalscope/metrics/__init__.py,sha256=CnhvODaILc4X0dnBoSPuSbTE2WbSf5NEEzM2M9a6uII,434
156
+ evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
148
157
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
149
158
  evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
150
- evalscope/metrics/metrics.py,sha256=XutNgiBAWACPZEIBSzylugDGFV4fDvo-qIYkxG7w2Mc,12634
159
+ evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
160
+ evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
151
161
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
152
162
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
153
163
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
@@ -160,27 +170,28 @@ evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7y
160
170
  evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
161
171
  evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
162
172
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
163
- evalscope/models/server_adapter.py,sha256=InS4M_LprbBV4xHcbPCm5y_S8-kApKDYhR-HEKXzG8Q,4169
173
+ evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
164
174
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
165
175
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
166
- evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
+ evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
177
+ evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
167
178
  evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
168
179
  evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
169
180
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
170
- evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
181
+ evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
171
182
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
172
183
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
173
184
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
174
185
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
175
- evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
186
+ evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
176
187
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
177
- evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
188
+ evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
178
189
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
179
190
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
180
191
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
181
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=CGYtmRw71-ycJIObAHm2gmmJl_1MXPJOwmHV-0WS8DY,1581
192
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
182
193
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
183
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=Yx5nxHGkmD4lJOJ-jcyqm2ZsGAxotJc77jUCkO1z0a4,1164
194
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
184
195
  evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
185
196
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
186
197
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -210,6 +221,11 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
210
221
  evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
211
222
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
212
223
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
224
+ evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
225
+ evalscope/report/app.py,sha256=rqjKgo7BFow4cA-vN9GaihQCd2m55ndHgUkWVr4Koyk,19470
226
+ evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
227
+ evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
228
+ evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
213
229
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
214
230
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
215
231
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -237,23 +253,19 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
237
253
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
238
254
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
239
255
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
240
- evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
241
- evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
242
- evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
243
- evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
244
256
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
245
257
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
246
- evalscope/utils/chat_service.py,sha256=h6Z9CpgdmalD9u2WNxdfJw2MdzDqsMfDHmnNk8GkffY,8666
258
+ evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
247
259
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
248
260
  evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
249
- evalscope/utils/logger.py,sha256=Cke17sVV9MrccINeuEsiVouJarDvS4Wt2JUaWK5NFLM,3582
250
- evalscope/utils/model_utils.py,sha256=PqIu1nMhoD7sauZATkuxkPo4lrYTQRh8kleERrWD-Po,678
261
+ evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
262
+ evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
251
263
  evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
252
264
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
253
265
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
254
266
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
255
- tests/cli/test_collection.py,sha256=pS-omRGU6yuvk5O5RPRIOklVKWKsV3lvPNvmk7rVIMY,2825
256
- tests/cli/test_run.py,sha256=V5lxiqtuNcpbjewPaE3KD8ssuIolvhhIzYEU7iDXlZE,5492
267
+ tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
268
+ tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
257
269
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
258
270
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
259
271
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -266,9 +278,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
266
278
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
267
279
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
268
280
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
269
- evalscope-0.9.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
270
- evalscope-0.9.0.dist-info/METADATA,sha256=KbU5bo3jjt1FsaTVXvdRqJJQEgge_431xW3uQHYKawI,25136
271
- evalscope-0.9.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
272
- evalscope-0.9.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
273
- evalscope-0.9.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
274
- evalscope-0.9.0.dist-info/RECORD,,
281
+ evalscope-0.10.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
282
+ evalscope-0.10.0.dist-info/METADATA,sha256=BwbHLPw5NELgkYNQ90wn_iUoDyUQfQD2WSHRD5XkYcM,28975
283
+ evalscope-0.10.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
284
+ evalscope-0.10.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
285
+ evalscope-0.10.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
286
+ evalscope-0.10.0.dist-info/RECORD,,
@@ -12,15 +12,19 @@ class TestCollection(unittest.TestCase):
12
12
  def test_create_collection(self):
13
13
  schema = CollectionSchema(name='math&reasoning', datasets=[
14
14
  CollectionSchema(name='math', datasets=[
15
+ CollectionSchema(name='generation', datasets=[
15
16
  DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
16
17
  DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
18
+ ]),
19
+ CollectionSchema(name='multiple_choice', datasets=[
17
20
  DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
18
21
  DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
22
+ ]),
19
23
  ]),
20
24
  CollectionSchema(name='reasoning', datasets=[
21
- DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
22
- DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
23
- DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
25
+ DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
26
+ DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
27
+ DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
24
28
  ]),
25
29
  ])
26
30
  print(schema.to_dict())
@@ -32,7 +36,7 @@ class TestCollection(unittest.TestCase):
32
36
  def test_generate_data(self):
33
37
  schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
34
38
  print(schema.to_dict())
35
- mixed_data = WeightedSampler(schema, 100).sample()
39
+ mixed_data = WeightedSampler(schema).sample(100)
36
40
  dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
37
41
 
38
42
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
@@ -40,14 +44,14 @@ class TestCollection(unittest.TestCase):
40
44
  from evalscope import TaskConfig, run_task
41
45
 
42
46
  task_cfg = TaskConfig(
43
- model='qwen2.5',
47
+ model='Qwen2.5-7B-Instruct',
44
48
  api_url='http://127.0.0.1:8801/v1/chat/completions',
45
49
  api_key='EMPTY',
46
50
  eval_type=EvalType.SERVICE,
47
51
  datasets=['data_collection'],
48
52
  dataset_args={'data_collection': {
49
- # 'local_path': 'outputs/mixed_data_test.jsonl'
50
- 'local_path': 'outputs/weighted_mixed_data.jsonl'
53
+ 'local_path': 'outputs/mixed_data_test.jsonl'
54
+ # 'local_path': 'outputs/weighted_mixed_data.jsonl'
51
55
  }},
52
56
  )
53
57
  run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -128,12 +128,15 @@ class TestRun(unittest.TestCase):
128
128
  from evalscope.config import TaskConfig
129
129
 
130
130
  task_cfg = TaskConfig(
131
- model='qwen2.5',
131
+ model='Qwen2.5-7B-Instruct',
132
132
  api_url='http://127.0.0.1:8801/v1/chat/completions',
133
133
  api_key='EMPTY',
134
134
  eval_type=EvalType.SERVICE,
135
135
  datasets=[
136
- 'mmlu_pro',
136
+ 'iquiz',
137
+ # 'ifeval',
138
+ # 'mmlu',
139
+ # 'mmlu_pro',
137
140
  # 'race',
138
141
  # 'trivia_qa',
139
142
  # 'cmmlu',
@@ -145,8 +148,14 @@ class TestRun(unittest.TestCase):
145
148
  # 'bbh',
146
149
  # 'hellaswag',
147
150
  ],
148
- limit=2,
149
- debug=True
151
+ dataset_args={
152
+ 'ceval': {
153
+ 'subset_list': [
154
+ 'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
155
+ ]
156
+ }
157
+ },
158
+ # limit=10
150
159
  )
151
160
 
152
161
  run_task(task_cfg=task_cfg)
@@ -1 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -1,133 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- import glob
4
- import json
5
- import os
6
- from collections import defaultdict
7
- from tabulate import tabulate
8
-
9
- from evalscope.utils.logger import get_logger
10
-
11
- logger = get_logger()
12
- """
13
- Combine and generate table for reports of LLMs.
14
- """
15
-
16
-
17
- def get_report(report_file: str):
18
- data_d: dict = json.load(open(report_file, 'r'))
19
- dataset_name = data_d['dataset_name']
20
- model_name = data_d['model_name']
21
- score = data_d['score'] # float or dict
22
- metric = data_d['metric']
23
- score_d = {}
24
- if isinstance(score, dict):
25
- score_d = score
26
- elif isinstance(score, float):
27
- score_d[metric] = score
28
- else:
29
- raise ValueError(f'Unknown score type: {type(score)}')
30
- score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
31
-
32
- return model_name, {'dataset_name': dataset_name, 'score': score_str}
33
-
34
-
35
- def get_model_reports(model_report_dir: str):
36
- model_report_dir = os.path.normpath(model_report_dir)
37
- report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
38
-
39
- model_reports_d = defaultdict(list)
40
- for file_path in report_files:
41
- model_name, report_d = get_report(file_path)
42
- model_reports_d[model_name].append(report_d)
43
-
44
- return model_reports_d
45
-
46
-
47
- def gen_table(reports_path_list: list):
48
- table_values = []
49
- headers = ['Model']
50
- is_headers_set = False
51
-
52
- for report_path in reports_path_list:
53
- model_reports_d = get_model_reports(report_path)
54
- for model_name, report_list in model_reports_d.items():
55
- report_list = sorted(report_list, key=lambda x: x['dataset_name'])
56
- if not is_headers_set:
57
- headers.extend([x['dataset_name'] for x in report_list])
58
- is_headers_set = True
59
- single_row = []
60
- single_row.append(model_name)
61
- for single_report in report_list:
62
- # e.g. '28.51 (acc)'
63
- single_row.append(single_report['score'])
64
- table_values.append(single_row)
65
-
66
- report_table = tabulate(table_values, headers=headers, tablefmt='grid')
67
- return report_table
68
-
69
-
70
- class ReportsRecorder:
71
- COMMON_DATASET_PATH = []
72
- CUSTOM_DATASET_PATH = []
73
-
74
- def __init__(self, oss_url: str = '', endpoint: str = ''):
75
- if oss_url and endpoint:
76
- import oss2
77
- from oss2.credentials import EnvironmentVariableCredentialsProvider
78
-
79
- auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
80
- oss_url = oss_url.replace('oss://', '').split('/')
81
- bucket_name = oss_url[0]
82
-
83
- self.object_path = '/'.join(oss_url[1:])
84
- self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
85
- else:
86
- self.object_path = ''
87
- self.bucket = None
88
-
89
- def append_path(self, report_path: str, dataset_name: str):
90
- if dataset_name == 'general_qa':
91
- self.CUSTOM_DATASET_PATH.append(report_path)
92
- else:
93
- self.COMMON_DATASET_PATH.append(report_path)
94
-
95
- def dump_reports(self, output_dir: str):
96
- result = {'CommonDataset': [], 'CustomDataset': []}
97
- for line in self.COMMON_DATASET_PATH:
98
- with open(line, 'r') as f:
99
- report = json.load(f)
100
- result['CommonDataset'].append(report)
101
- for line in self.CUSTOM_DATASET_PATH:
102
- with open(line, 'r') as f:
103
- report = json.load(f)
104
- report.update({'name': os.path.basename(line)})
105
- result['CustomDataset'].append(report)
106
-
107
- os.makedirs(output_dir, exist_ok=True)
108
- output_file_name = 'metric.json'
109
- output_path = os.path.join(output_dir, output_file_name)
110
- with open(output_path, 'w+') as f:
111
- f.write(json.dumps(result, ensure_ascii=False, indent=4))
112
-
113
- if self.bucket:
114
- remote_path = os.path.join(self.object_path, output_file_name)
115
- logger.info(f'** Upload report to oss: {remote_path}')
116
- self.bucket.put_object_from_file(remote_path, output_path)
117
-
118
-
119
- if __name__ == '__main__':
120
- report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
121
- report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
122
-
123
- report_table = gen_table([report_dir_1, report_dir_2])
124
- print(report_table)
125
-
126
- # ALL VALUES ONLY FOR EXAMPLE
127
- # +--------------------------+-------------------+-------------+
128
- # | Model | CompetitionMath | GSM8K |
129
- # +==========================+===================+=============+
130
- # | ZhipuAI_chatglm2-6b-base | 25.0 (acc) | 30.50 (acc) |
131
- # +--------------------------+-------------------+-------------+
132
- # | ZhipuAI_chatglm2-6b | 30.5 (acc) | 40.50 (acc) |
133
- # +--------------------------+-------------------+-------------+
@@ -1,90 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- # Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
4
-
5
# Maps each subject key to a one-element list holding its subcategory.
# Insertion order is significant: it fixes the order of any mapping derived
# by iterating over this dict.
subcategories = {
    'abstract_algebra': ['math'],
    'anatomy': ['health'],
    'astronomy': ['physics'],
    'business_ethics': ['business'],
    'clinical_knowledge': ['health'],
    'college_biology': ['biology'],
    'college_chemistry': ['chemistry'],
    'college_computer_science': ['computer science'],
    'college_mathematics': ['math'],
    'college_medicine': ['health'],
    'college_physics': ['physics'],
    'computer_security': ['computer science'],
    'conceptual_physics': ['physics'],
    'econometrics': ['economics'],
    'electrical_engineering': ['engineering'],
    'elementary_mathematics': ['math'],
    'formal_logic': ['philosophy'],
    'global_facts': ['other'],
    'high_school_biology': ['biology'],
    'high_school_chemistry': ['chemistry'],
    'high_school_computer_science': ['computer science'],
    'high_school_european_history': ['history'],
    'high_school_geography': ['geography'],
    'high_school_government_and_politics': ['politics'],
    'high_school_macroeconomics': ['economics'],
    'high_school_mathematics': ['math'],
    'high_school_microeconomics': ['economics'],
    'high_school_physics': ['physics'],
    'high_school_psychology': ['psychology'],
    'high_school_statistics': ['math'],
    'high_school_us_history': ['history'],
    'high_school_world_history': ['history'],
    'human_aging': ['health'],
    'human_sexuality': ['culture'],
    'international_law': ['law'],
    'jurisprudence': ['law'],
    'logical_fallacies': ['philosophy'],
    'machine_learning': ['computer science'],
    'management': ['business'],
    'marketing': ['business'],
    'medical_genetics': ['health'],
    'miscellaneous': ['other'],
    'moral_disputes': ['philosophy'],
    'moral_scenarios': ['philosophy'],
    'nutrition': ['health'],
    'philosophy': ['philosophy'],
    'prehistory': ['history'],
    'professional_accounting': ['other'],
    'professional_law': ['law'],
    'professional_medicine': ['health'],
    'professional_psychology': ['psychology'],
    'public_relations': ['politics'],
    'security_studies': ['politics'],
    'sociology': ['culture'],
    'us_foreign_policy': ['politics'],
    'virology': ['health'],
    'world_religions': ['philosophy'],
}
64
-
65
# Groups the subcategory names into four top-level categories.
categories = {
    'STEM': [
        'physics',
        'chemistry',
        'biology',
        'computer science',
        'math',
        'engineering',
    ],
    'Humanities': [
        'history',
        'philosophy',
        'law',
    ],
    'Social Science': [
        'politics',
        'culture',
        'economics',
        'geography',
        'psychology',
    ],
    'Other': [
        'other',
        'business',
        'health',
    ],
}
71
-
72
-
73
def main():
    """Print a mapping from each subject to its display name and grouping.

    For every key of ``subcategories`` the printed dict holds a list
    ``[display name, subcategory, category]``: the display name is the
    subject key with underscores replaced by spaces and each word
    capitalized, the subcategory is the first entry of the subject's list,
    and the category is looked up from ``categories``.
    """
    # Invert ``categories`` so each subcategory points at its category.
    category_of = {
        sub_name: cat_name
        for cat_name, sub_names in categories.items()
        for sub_name in sub_names
    }

    subject_mapping = {}
    for subject, sub_names in subcategories.items():
        primary = sub_names[0]
        category_name: str = category_of[primary]
        words = (word.capitalize() for word in subject.split('_'))
        subject_show_name: str = ' '.join(words)
        subject_mapping[subject] = [subject_show_name, primary, category_name]

    print(subject_mapping)


# Run the mapping dump only when the module is executed as a script.
if __name__ == '__main__':
    main()