evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (59) hide show
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
@@ -174,7 +174,7 @@ class ChatService:
174
174
  )
175
175
 
176
176
  def _prepare_text_inputs(self, request: TextCompletionRequest):
177
- inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=True).to(self.device)
177
+ inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
178
178
  prompt_tokens = len(inputs['input_ids'][0])
179
179
  return inputs, prompt_tokens
180
180
 
@@ -204,7 +204,7 @@ class ChatService:
204
204
  def _prepare_chat_inputs(self, request: ChatCompletionRequest):
205
205
  formatted_prompt = self.tokenizer.apply_chat_template(
206
206
  request.messages, tokenize=False, add_generation_prompt=True)
207
- inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
207
+ inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
208
208
  prompt_tokens = len(inputs['input_ids'][0])
209
209
  return formatted_prompt, inputs, prompt_tokens
210
210
 
@@ -135,7 +135,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
135
135
  Dump dict to yaml file.
136
136
  """
137
137
  with open(yaml_file, 'w') as f:
138
- yaml.dump(d, f, default_flow_style=False)
138
+ yaml.dump(d, f, default_flow_style=False, allow_unicode=True)
139
139
 
140
140
 
141
141
  def json_to_dict(json_file) -> dict:
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.10.0'
4
- __release_datetime__ = '2025-01-20 20:00:00'
3
+ __version__ = '0.11.0'
4
+ __release_datetime__ = '2025-02-13 12:00:00'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.10.0
3
+ Version: 0.11.0
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
19
19
  Requires-Dist: absl-py
20
20
  Requires-Dist: accelerate
21
21
  Requires-Dist: cachetools
22
- Requires-Dist: datasets<=3.0.1,>=3.0.0
22
+ Requires-Dist: datasets<=3.2.0,>=3.0.0
23
23
  Requires-Dist: editdistance
24
24
  Requires-Dist: jieba
25
25
  Requires-Dist: jsonlines
26
+ Requires-Dist: langdetect
27
+ Requires-Dist: latex2sympy2
26
28
  Requires-Dist: matplotlib
27
29
  Requires-Dist: modelscope[framework]
28
30
  Requires-Dist: nltk>=3.9
@@ -42,20 +44,24 @@ Requires-Dist: scikit-learn
42
44
  Requires-Dist: seaborn
43
45
  Requires-Dist: sentencepiece
44
46
  Requires-Dist: simple-ddl-parser
47
+ Requires-Dist: sympy
45
48
  Requires-Dist: tabulate
46
49
  Requires-Dist: tiktoken
47
50
  Requires-Dist: torch
48
51
  Requires-Dist: tqdm
49
52
  Requires-Dist: transformers>=4.33
50
53
  Requires-Dist: transformers-stream-generator
54
+ Requires-Dist: word2number
51
55
  Provides-Extra: all
52
56
  Requires-Dist: absl-py; extra == "all"
53
57
  Requires-Dist: accelerate; extra == "all"
54
58
  Requires-Dist: cachetools; extra == "all"
55
- Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
59
+ Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
56
60
  Requires-Dist: editdistance; extra == "all"
57
61
  Requires-Dist: jieba; extra == "all"
58
62
  Requires-Dist: jsonlines; extra == "all"
63
+ Requires-Dist: langdetect; extra == "all"
64
+ Requires-Dist: latex2sympy2; extra == "all"
59
65
  Requires-Dist: matplotlib; extra == "all"
60
66
  Requires-Dist: modelscope[framework]; extra == "all"
61
67
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -75,12 +81,14 @@ Requires-Dist: scikit-learn; extra == "all"
75
81
  Requires-Dist: seaborn; extra == "all"
76
82
  Requires-Dist: sentencepiece; extra == "all"
77
83
  Requires-Dist: simple-ddl-parser; extra == "all"
84
+ Requires-Dist: sympy; extra == "all"
78
85
  Requires-Dist: tabulate; extra == "all"
79
86
  Requires-Dist: tiktoken; extra == "all"
80
87
  Requires-Dist: torch; extra == "all"
81
88
  Requires-Dist: tqdm; extra == "all"
82
89
  Requires-Dist: transformers>=4.33; extra == "all"
83
90
  Requires-Dist: transformers-stream-generator; extra == "all"
91
+ Requires-Dist: word2number; extra == "all"
84
92
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
85
93
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
86
94
  Requires-Dist: mteb==1.19.4; extra == "all"
@@ -92,10 +100,10 @@ Requires-Dist: numpy; extra == "all"
92
100
  Requires-Dist: sse-starlette; extra == "all"
93
101
  Requires-Dist: transformers; extra == "all"
94
102
  Requires-Dist: unicorn; extra == "all"
95
- Requires-Dist: gradio>=5.4.0; extra == "all"
103
+ Requires-Dist: gradio==5.4.0; extra == "all"
96
104
  Requires-Dist: plotly>=5.23.0; extra == "all"
97
105
  Provides-Extra: app
98
- Requires-Dist: gradio>=5.4.0; extra == "app"
106
+ Requires-Dist: gradio==5.4.0; extra == "app"
99
107
  Requires-Dist: plotly>=5.23.0; extra == "app"
100
108
  Provides-Extra: inner
101
109
  Requires-Dist: absl-py; extra == "inner"
@@ -215,7 +223,8 @@ Please scan the QR code below to join our community groups:
215
223
 
216
224
 
217
225
  ## 🎉 News
218
- - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
226
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
227
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
219
228
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
220
229
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
221
230
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -416,27 +425,27 @@ To create a public link, set `share=True` in `launch()`.
416
425
  <table>
417
426
  <tr>
418
427
  <td style="text-align: center;">
419
- <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
428
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
420
429
  <p>Setting Interface</p>
421
430
  </td>
422
431
  <td style="text-align: center;">
423
- <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
432
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
424
433
  <p>Model Comparison</p>
425
434
  </td>
426
435
  </tr>
427
436
  <tr>
428
437
  <td style="text-align: center;">
429
- <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
438
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
430
439
  <p>Report Overview</p>
431
440
  </td>
432
441
  <td style="text-align: center;">
433
- <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
442
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
434
443
  <p>Report Details</p>
435
444
  </td>
436
445
  </tr>
437
446
  </table>
438
447
 
439
- For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
448
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
440
449
 
441
450
  ## 🌐 Evaluation of Specified Model API
442
451
 
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
3
- evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
2
+ evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
3
+ evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
4
4
  evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
5
- evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
5
+ evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
- evalscope/version.py,sha256=59oai-Z2lJog2HCNhMbBxRg4D3vkwPK5sfffmDSPntE,119
8
+ evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,13 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
60
- evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
59
+ evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
60
+ evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
61
+ evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
+ evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
61
63
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
62
64
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
63
- evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
65
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
64
66
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
67
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
66
68
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
67
69
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
68
70
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -91,73 +93,81 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
91
93
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
92
94
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
93
95
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
94
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
96
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=Qz2oNGw0H_4FtfY-Izdxv9fgwxScJksyvwzeQw-aVyo,11374
95
97
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
96
- evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
97
98
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
98
99
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
99
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
100
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm0m6_7rJywqk3Rk,10446
100
101
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
101
102
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
102
103
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
103
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
104
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
105
+ evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
107
+ evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
104
109
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
110
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
111
+ evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
+ evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
113
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
106
114
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
107
115
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
108
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
116
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
109
117
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
110
118
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
111
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
119
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
112
120
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
113
121
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
114
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
122
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
115
123
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=JwJoXfAiawx9Rey1MsEtwCdo7QMl_wxOjspiWAuJFko,2074
117
- evalscope/benchmarks/ifeval/instructions.py,sha256=8mV4f9H1vE8tEnbF1k8uVoDjzJL2tt7lCu2JQaqJelw,56247
124
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
125
+ evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
118
126
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
119
127
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
120
128
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
121
129
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
130
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
131
+ evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
123
133
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
124
134
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
125
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
135
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
126
136
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
127
137
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
138
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
129
139
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
130
140
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
131
- evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
141
+ evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
132
142
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
133
143
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
134
144
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
135
145
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
136
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
146
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
137
147
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
138
148
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
139
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
149
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
140
150
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
141
151
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
142
152
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
143
- evalscope/cli/start_app.py,sha256=icLwBq5yHVmJ4C9y-sYq_o_rPvCT-oO-F2r7RlegHv0,706
153
+ evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
144
154
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
145
155
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
146
156
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
147
157
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
148
- evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
149
- evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
150
- evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
158
+ evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
159
+ evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
160
+ evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
151
161
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
152
- evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
162
+ evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
153
163
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
154
164
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
155
165
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
156
166
  evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
157
167
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
158
- evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
159
- evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
160
- evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
168
+ evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
169
+ evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
170
+ evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
161
171
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
162
172
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
163
173
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
@@ -165,12 +175,12 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
165
175
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
166
176
  evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
167
177
  evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
168
- evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
169
- evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
170
- evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
171
- evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
178
+ evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
179
+ evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
180
+ evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
181
+ evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
172
182
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
173
- evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
183
+ evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
174
184
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
175
185
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
176
186
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
@@ -222,7 +232,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
222
232
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
223
233
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
224
234
  evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
225
- evalscope/report/app.py,sha256=rqjKgo7BFow4cA-vN9GaihQCd2m55ndHgUkWVr4Koyk,19470
235
+ evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
226
236
  evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
227
237
  evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
228
238
  evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -255,9 +265,9 @@ evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1
255
265
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
256
266
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
257
267
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
258
- evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
268
+ evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5J4,8676
259
269
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
260
- evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
270
+ evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
261
271
  evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
262
272
  evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
263
273
  evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
@@ -265,7 +275,7 @@ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
265
275
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
266
276
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
267
277
  tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
268
- tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
278
+ tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
269
279
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
270
280
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
271
281
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -278,9 +288,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
278
288
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
279
289
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
280
290
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
281
- evalscope-0.10.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
282
- evalscope-0.10.0.dist-info/METADATA,sha256=BwbHLPw5NELgkYNQ90wn_iUoDyUQfQD2WSHRD5XkYcM,28975
283
- evalscope-0.10.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
284
- evalscope-0.10.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
285
- evalscope-0.10.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
286
- evalscope-0.10.0.dist-info/RECORD,,
291
+ evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
292
+ evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
293
+ evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
294
+ evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
295
+ evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
296
+ evalscope-0.11.0.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -73,16 +73,18 @@ class TestRun(unittest.TestCase):
73
73
  def test_run_task(self):
74
74
  task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
75
75
  'datasets': [
76
- 'mmlu_pro',
76
+ # 'mmlu_pro',
77
77
  # 'bbh',
78
- 'hellaswag',
78
+ # 'hellaswag',
79
79
  # 'gsm8k',
80
- # 'arc'
80
+ # 'arc',
81
81
  # 'race',
82
+ 'ifeval',
82
83
  # 'truthful_qa',
83
84
  # 'trivia_qa',
84
85
  ],
85
- 'limit': 20,
86
+ 'limit': 2,
87
+ 'eval_batch_size': 2,
86
88
  'debug': True}
87
89
  run_task(task_cfg=task_cfg)
88
90
 
@@ -93,9 +95,9 @@ class TestRun(unittest.TestCase):
93
95
 
94
96
  task_cfg = TaskConfig(
95
97
  model='qwen/Qwen2-0.5B-Instruct',
96
- datasets=['ceval', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
98
+ datasets=['general_mcq', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
97
99
  dataset_args={
98
- 'ceval': {
100
+ 'general_mcq': {
99
101
  'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
100
102
  'subset_list': [
101
103
  'example' # 评测数据集名称,上述 *_dev.csv 中的 *
@@ -117,8 +119,17 @@ class TestRun(unittest.TestCase):
117
119
 
118
120
  task_cfg = TaskConfig(
119
121
  model='qwen/Qwen2-0.5B-Instruct',
120
- datasets=['humaneval'],
121
- limit=2
122
+ datasets=[
123
+ # 'math_500',
124
+ # 'aime24',
125
+ 'competition_math'
126
+ ],
127
+ dataset_args={
128
+ 'competition_math': {
129
+ 'subset_list': ['Level 4', 'Level 5']
130
+ }
131
+ },
132
+ limit=5
122
133
  )
123
134
 
124
135
  run_task(task_cfg=task_cfg)
@@ -128,12 +139,12 @@ class TestRun(unittest.TestCase):
128
139
  from evalscope.config import TaskConfig
129
140
 
130
141
  task_cfg = TaskConfig(
131
- model='Qwen2.5-7B-Instruct',
142
+ model='Qwen2.5-0.5B-Instruct',
132
143
  api_url='http://127.0.0.1:8801/v1/chat/completions',
133
144
  api_key='EMPTY',
134
145
  eval_type=EvalType.SERVICE,
135
146
  datasets=[
136
- 'iquiz',
147
+ # 'iquiz',
137
148
  # 'ifeval',
138
149
  # 'mmlu',
139
150
  # 'mmlu_pro',
@@ -141,25 +152,91 @@ class TestRun(unittest.TestCase):
141
152
  # 'trivia_qa',
142
153
  # 'cmmlu',
143
154
  # 'humaneval',
144
- # 'competition_math',
145
155
  # 'gsm8k',
156
+ # 'bbh',
157
+ 'competition_math',
158
+ 'math_500',
159
+ 'aime24',
160
+ 'gpqa',
146
161
  # 'arc',
147
162
  # 'ceval',
148
- # 'bbh',
149
163
  # 'hellaswag',
150
164
  ],
151
165
  dataset_args={
166
+ 'mmlu': {
167
+ 'subset_list': ['elementary_mathematics'],
168
+ 'few_shot_num': 0
169
+ },
170
+ 'mmlu_pro': {
171
+ 'subset_list': ['math'],
172
+ 'few_shot_num': 0
173
+ },
152
174
  'ceval': {
153
175
  'subset_list': [
154
- 'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
155
- ]
156
- }
176
+ 'computer_network', 'operating_system', 'computer_architecture'
177
+ ],
178
+ 'few_shot_num': 0
179
+ },
180
+ 'cmmlu': {
181
+ 'subset_list': ['elementary_chinese'],
182
+ 'few_shot_num': 0
183
+ },
184
+ 'bbh': {
185
+ 'subset_list': ['word_sorting', 'movie_recommendation'],
186
+ },
187
+ 'gpqa': {
188
+ 'subset_list': ['gpqa_diamond'],
189
+ 'few_shot_num': 0
190
+ },
191
+ 'humaneval': {
192
+ 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
193
+ },
194
+ 'competition_math': {
195
+ 'subset_list': ['Level 1']
196
+ },
197
+ },
198
+ eval_batch_size=5,
199
+ limit=10,
200
+ debug=True,
201
+ generation_config={
202
+ 'temperature': 0.7,
203
+ 'n': 5
157
204
  },
158
- # limit=10
205
+ use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
159
206
  )
160
207
 
161
208
  run_task(task_cfg=task_cfg)
162
209
 
163
210
 
211
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
212
+ def test_run_batch_eval(self):
213
+ from evalscope.config import TaskConfig
214
+
215
+ task_cfg = TaskConfig(
216
+ model='LLM-Research/Llama-3.2-1B-Instruct',
217
+ datasets=[
218
+ # 'math_500',
219
+ # 'aime24',
220
+ # 'competition_math'
221
+ # 'arc',
222
+ 'gsm8k'
223
+ # 'truthful_qa'
224
+ ],
225
+ dataset_args={
226
+ 'competition_math': {
227
+ 'subset_list': ['Level 4', 'Level 5']
228
+ }
229
+ },
230
+ eval_batch_size=2,
231
+ limit=5,
232
+ generation_config={
233
+ 'max_new_tokens': 2048,
234
+ 'temperature': 0.7,
235
+ 'num_return_sequences': 2,
236
+ }
237
+ )
238
+
239
+ run_task(task_cfg=task_cfg)
240
+
164
241
  if __name__ == '__main__':
165
242
  unittest.main()
@@ -1 +0,0 @@
1
- {'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}