evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py CHANGED
@@ -1,7 +1,5 @@
  import numpy as np
- import os
  import random
- import torch
  from enum import Enum
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

@@ -69,8 +67,13 @@ def seed_everything(seed: int):
      """
      random.seed(seed)
      np.random.seed(seed)
-     torch.manual_seed(seed)
-     if torch.cuda.is_available():
-         torch.cuda.manual_seed_all(seed)
-     torch.backends.cudnn.deterministic = True
-     torch.backends.cudnn.benchmark = False
+     try:
+         import torch
+
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+     except ImportError:
+         pass
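The change above makes torch an optional dependency for seeding: `random` and NumPy are always seeded, while the torch, CUDA, and cuDNN settings are applied only when torch can be imported. A minimal usage sketch, assuming this hunk belongs to `evalscope/utils/model_utils.py` (inferred from the matching +10/-7 entry in the file list) and that `seed_everything` stays importable from that module:

```python
# Hedged sketch: the import path below is an assumption based on the file list
# (evalscope/utils/model_utils.py); adjust it if the function lives elsewhere.
from evalscope.utils.model_utils import seed_everything

# Seeds Python's `random` and NumPy unconditionally; torch seeding (including
# CUDA and the cuDNN determinism flags) is applied only if torch is installed.
seed_everything(42)
```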
evalscope/utils/multi_choices.py ADDED
@@ -0,0 +1,271 @@
+ # flake8: noqa: E501
+ from __future__ import annotations
+
+ import re
+ from typing import TYPE_CHECKING, List, Optional
+
+ if TYPE_CHECKING:
+     from evalscope.api.evaluator import Choices, Target, TaskState
+
+ FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:
+
+ {fewshot}
+
+ """.lstrip()
+
+ CHINESE_FEW_SHOT_TEMPLATE = r"""以下是一些示例问题:
+
+ {fewshot}
+
+ """.lstrip()
+
+ CHINESE_SINGLE_ANSWER_TEMPLATE = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。
+
+ 问题:{question}
+ 选项:
+ {choices}
+ """.lstrip()
+
+ CHINESE_SINGLE_ANSWER_TEMPLATE_COT = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。请在回答前进行一步步思考。
+
+ 问题:{question}
+ 选项:
+ {choices}
+ """.lstrip()
+
+ SINGLE_ANSWER_TEMPLATE = r"""
+ Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+ {question}
+
+ {choices}
+ """.strip()
+
+ SINGLE_ANSWER_TEMPLATE_COT = r"""
+ Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+ {question}
+
+ {choices}
+ """.strip()
+
+ MULTIPLE_ANSWER_TEMPLATE = r"""
+ Answer the following multiple choice question where multiple answers may be correct. The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
+
+ {question}
+
+ {choices}
+ """.strip()
+
+ MULTIPLE_ANSWER_TEMPLATE_COT = r"""
+ Answer the following multiple choice question where multiple answers may be correct. The last line of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}. Think step by step before answering.
+
+ {question}
+
+ {choices}
+ """.strip()
+
+
+ def unshuffle_choices(choices: Choices) -> Choices:
+     # `sorted` returns `list[Choice]`, but for consistency we wrap this back
+     # into a `Choices` object
+     return Choices(sorted(choices, key=lambda choice: choice.original_position))
+
+
+ def answer_options(choices: Choices) -> str:
+     r"""
+     Returns the `choices` formatted as a multiple choice question, e.g.:
+
+     ["choice 1", "choice 2", "choice 3"] ->
+     "A) choice 1\nB) choice 2\nC) choice 3"
+     """
+     indexes = list(range(len(choices)))
+
+     return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
+
+
+ def prompt(question: str, choices: Choices, template: str, fewshot: Optional[str] = None) -> str:
+
+     choices_text = answer_options(choices)
+     letters = ','.join(answer_character(i) for i in range(len(choices)))
+     if not fewshot:
+         return template.format(
+             choices=choices_text,
+             letters=letters,
+             question=question,
+         )
+     else:
+         return template.format(
+             choices=choices_text,
+             letters=letters,
+             question=question,
+             fewshot=fewshot,
+         )
+
+
+ def format_example(
+     question: str,
+     choices: Choices,
+     answer: Target,
+ ) -> str:
+     """Format a single example for few-shot learning.
+
+     Args:
+         question (str): The question text.
+         choices (list[str]): The list of choices.
+         answer (list[str]): The correct answers.
+
+     Returns:
+         str: Formatted example string.
+     """
+     choices_text = answer_options(choices)
+     return f'{question}\n{choices_text}\nANSWER: {answer.text}'
+
+
+ def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
+     """
+     Convenience function for extracting answers from the state output.
+
+     The generated response must be in the format 'ANSWER: <answers>',
+     otherwise we can't extract what the model thinks is "true". We can be a
+     bit flexible whether these are "AB" vs "A,B" vs "A B".
+
+     However, if the answer isn't in the expected format the model has
+     failed in the task so we'll ultimately just mark it as incorrect
+     """
+     # First check whether the string strictly ends with the expected answer
+     # In this case, we're looking for a single line which contains the expected
+     # ANSWER: <answer> string with only whitespace or a period/full stop at the end.
+     match = re.search(
+         r'(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n|\.)',
+         state.output.completion,
+         flags=re.MULTILINE,
+     )
+
+     # If we couldn't match the strict version, we can try the less strict
+     # version for backward compatibility
+     if match is None:
+         match = re.search(
+             r'(?i)ANSWER\s*:\s*([A-Za-z\d ,]+)(?:[^\w]|\n|$|\.)',
+             state.output.completion,
+         )
+
+     if match is None:
+         return set()
+
+     matched = match.group(1)
+
+     # Strip trailing period / full stop
+     matched = matched.strip()
+     matched = matched.rstrip('.')
+
+     allowed_options = set(answer_character(i) for i in range(len(state.choices)))
+
+     if multiple_correct:
+         # Match must contain only the allowed choices
+         # (may be separated by commas, spaces, the word 'and', or nothing at all)
+
+         matched = matched.replace(' and ', '')
+
+         matched = matched.replace(' ', '')
+
+         split_comma = set(matched.split(','))
+         if split_comma.issubset(allowed_options):
+             answers = split_comma
+             return answers
+
+         split_nothing = set(matched)
+         if split_nothing.issubset(allowed_options):
+             answers = split_nothing
+             return answers
+
+     else:
+         # Match must contain a single letter in the allowed choices
+         if matched in allowed_options:
+             answers = {matched}
+             return answers
+
+     return set()
+
+
+ def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[str]:
+     """
+     Convenience function for extracting answers from the state output in Chinese format.
+
+     The generated response must be in the format '答案:选项',
+     otherwise we can't extract what the model thinks is "true". We can be a
+     bit flexible whether these are "AB" vs "A,B" vs "A B".
+     """
+     # Simple pattern to capture answers with optional bold markdown
+     pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
+     match = re.search(pattern, state.output.completion, flags=re.MULTILINE)
+
+     if match is None:
+         return set()
+
+     matched = match.group(1).strip().rstrip('。.')
+     allowed_options = set(answer_character(i) for i in range(len(state.choices)))
+
+     if multiple_correct:
+         # Handle comma-separated or continuous letters
+         matched = matched.replace(' 和 ', '').replace(' ', '').replace(',', ',')
+         answers = set(matched.split(',')) if ',' in matched else set(matched)
+         return answers if answers.issubset(allowed_options) else set()
+     else:
+         # Single answer
+         return {matched} if matched in allowed_options else set()
+
+
+ def set_choices_based_on_generated_response(state: TaskState, answers: set[str]) -> None:
+     true_answers = [answer_index(letter) for letter in answers]
+
+     for i in range(len(state.choices)):
+         if i in true_answers:
+             state.choices.mark_choice(i, True)
+         else:
+             state.choices.mark_choice(i, False)
+
+
+ def valid_template(template: str) -> bool:
+     """Check if a template has the required capture groups for a multiple choice question"""
+     return bool(re.search(r'\{question\}', template) and re.search(r'\{choices\}', template))
+
+
+ class MultipleChoiceTemplate:
+     """
+     Templates for multiple choice questions.
+     """
+
+     SINGLE_ANSWER = SINGLE_ANSWER_TEMPLATE
+     SINGLE_ANSWER_COT = SINGLE_ANSWER_TEMPLATE_COT
+     MULTIPLE_ANSWER = MULTIPLE_ANSWER_TEMPLATE
+     MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
+     CHINESE_FEW_SHOT_TEMPLATE = CHINESE_FEW_SHOT_TEMPLATE
+     CHINESE_SINGLE_ANSWER_TEMPLATE = CHINESE_SINGLE_ANSWER_TEMPLATE
+     CHINESE_SINGLE_ANSWER_TEMPLATE_COT = CHINESE_SINGLE_ANSWER_TEMPLATE_COT
+
+
+ def answer_character(index: int) -> str:
+     r"""
+     Helper to go from array index to char, for example:
+
+     0 -> 'A', 1 -> 'B', etc
+     """
+     if index < 26:
+         return chr(ord('A') + index)
+     else:
+         return str(index - 25)
+
+
+ def answer_index(char: str) -> int:
+     r"""
+     Helper to go from char to array index, for example:
+
+     'A' -> 0, 'B' -> 1, etc
+     """
+     if char.isalpha() or char == ',' or char == ' ':
+         return ord(char.upper()) - ord('A')
+     elif char.isnumeric():
+         return 25 + int(char)
+     else:
+         raise ValueError(f'Unepxected multiple choice answer: {char} (must be a letter or number)')
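The new `evalscope/utils/multi_choices.py` module centralizes the multiple-choice prompt templates and the `ANSWER: ...` parsing used by the refactored adapters. A small sketch of how the pieces fit together, using only names defined in the file above (the import path follows that file path; the regex is the strict pattern from `parse_answers`):

```python
import re

from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, answer_index

choices = ['Paris', 'London', 'Berlin']
letters = ','.join(answer_character(i) for i in range(len(choices)))  # 'A,B,C'
options = '\n'.join(f'{answer_character(i)}) {c}' for i, c in enumerate(choices))

# Build the zero-shot single-answer prompt from the shared template.
prompt_text = MultipleChoiceTemplate.SINGLE_ANSWER.format(
    letters=letters,
    question='Which city is the capital of France?',
    choices=options,
)

# The strict pattern parse_answers() applies to the model completion.
completion = 'The capital of France is Paris.\nANSWER: A'
match = re.search(r'(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n|\.)', completion, flags=re.MULTILINE)
assert match and answer_index(match.group(1).strip()) == 0  # maps 'A' back to choice index 0
```

In the adapters themselves this parsing runs against `TaskState.output.completion`, and the result is written back through `set_choices_based_on_generated_response`.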
evalscope/utils/url_utils.py ADDED
@@ -0,0 +1,65 @@
+ import base64
+ import httpx
+ import mimetypes
+ import re
+
+
+ def is_http_url(url: str) -> bool:
+     return url.startswith('http://') or url.startswith('https://')
+
+
+ def is_data_uri(url: str) -> bool:
+     pattern = r'^data:([^;]+);base64,.*'
+     return re.match(pattern, url) is not None
+
+
+ def data_uri_mime_type(data_url: str) -> str | None:
+     pattern = r'^data:([^;]+);.*'
+     match = re.match(pattern, data_url)
+     if match:
+         mime_type = match.group(1)
+         return mime_type
+     else:
+         return None
+
+
+ def data_uri_to_base64(data_uri: str) -> str:
+     pattern = r'^data:[^,]+,'
+     stripped_uri = re.sub(pattern, '', data_uri)
+     return stripped_uri
+
+
+ def file_as_data(file: str) -> tuple[bytes, str]:
+     if is_data_uri(file):
+         # resolve mime type and base64 content
+         mime_type = data_uri_mime_type(file) or 'image/png'
+         file_base64 = data_uri_to_base64(file)
+         file_bytes = base64.b64decode(file_base64)
+     else:
+         # guess mime type; need strict=False for webp images
+         type, _ = mimetypes.guess_type(file, strict=False)
+         if type:
+             mime_type = type
+         else:
+             mime_type = 'image/png'
+
+         # handle url or file
+         if is_http_url(file):
+             client = httpx.Client()
+             file_bytes = client.get(file).content
+         else:
+             with open(file, 'rb') as f:
+                 file_bytes = f.read()
+
+     # return bytes and type
+     return file_bytes, mime_type
+
+
+ def file_as_data_uri(file: str) -> str:
+     if is_data_uri(file):
+         return file
+     else:
+         bytes, mime_type = file_as_data(file)
+         base64_file = base64.b64encode(bytes).decode('utf-8')
+         file = f'data:{mime_type};base64,{base64_file}'
+         return file
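The new `evalscope/utils/url_utils.py` helpers normalize image and file references (local paths, http(s) URLs, or data URIs) into raw bytes plus a mime type, or into a data URI. A self-contained sketch using a tiny text payload in place of a real image, with the import path taken from the file path above:

```python
import base64

from evalscope.utils.url_utils import data_uri_mime_type, file_as_data, file_as_data_uri, is_data_uri

# A small base64 payload standing in for a real image.
payload = base64.b64encode(b'hello').decode('utf-8')
uri = f'data:text/plain;base64,{payload}'

assert is_data_uri(uri)
assert data_uri_mime_type(uri) == 'text/plain'

# file_as_data() accepts a data URI, an http(s) URL, or a local path and
# returns the decoded bytes together with the resolved mime type.
data, mime = file_as_data(uri)
assert data == b'hello' and mime == 'text/plain'

# Inputs that are already data URIs are returned unchanged.
assert file_as_data_uri(uri) == uri
```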
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.17.1'
- __release_datetime__ = '2025-07-18 17:00:00'
+ __version__ = '1.0.0'
+ __release_datetime__ = '2025-08-25 12:00:00'
{evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.17.1
+ Version: 1.0.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -18,7 +18,10 @@ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: accelerate
- Requires-Dist: datasets==3.2.0
+ Requires-Dist: colorlog
+ Requires-Dist: datasets==3.6.0
+ Requires-Dist: docstring-parser
+ Requires-Dist: dotenv
  Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
@@ -28,11 +31,14 @@ Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]>=1.27
  Requires-Dist: nltk>=3.9
  Requires-Dist: openai
+ Requires-Dist: overrides
  Requires-Dist: pandas
  Requires-Dist: pillow
  Requires-Dist: pyarrow
+ Requires-Dist: pydantic
  Requires-Dist: pyyaml>=5.1
  Requires-Dist: requests
+ Requires-Dist: rich
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
  Requires-Dist: sacrebleu
@@ -50,10 +56,14 @@ Requires-Dist: iopath; extra == "aigc"
  Requires-Dist: omegaconf; extra == "aigc"
  Requires-Dist: open-clip-torch; extra == "aigc"
  Requires-Dist: opencv-python; extra == "aigc"
+ Requires-Dist: peft>=0.17; extra == "aigc"
  Requires-Dist: torchvision; extra == "aigc"
  Provides-Extra: all
  Requires-Dist: accelerate; extra == "all"
- Requires-Dist: datasets==3.2.0; extra == "all"
+ Requires-Dist: colorlog; extra == "all"
+ Requires-Dist: datasets==3.6.0; extra == "all"
+ Requires-Dist: docstring-parser; extra == "all"
+ Requires-Dist: dotenv; extra == "all"
  Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
@@ -63,11 +73,14 @@ Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]>=1.27; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
+ Requires-Dist: overrides; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: pillow; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
+ Requires-Dist: pydantic; extra == "all"
  Requires-Dist: pyyaml>=5.1; extra == "all"
  Requires-Dist: requests; extra == "all"
+ Requires-Dist: rich; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
@@ -91,7 +104,6 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
  Requires-Dist: numpy; extra == "all"
- Requires-Dist: rich; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: uvicorn; extra == "all"
@@ -102,8 +114,9 @@ Requires-Dist: iopath; extra == "all"
  Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: open-clip-torch; extra == "all"
  Requires-Dist: opencv-python; extra == "all"
+ Requires-Dist: peft>=0.17; extra == "all"
  Requires-Dist: torchvision; extra == "all"
- Requires-Dist: bfcl-eval; extra == "all"
+ Requires-Dist: bfcl-eval==2025.6.16; extra == "all"
  Requires-Dist: human-eval; extra == "all"
  Requires-Dist: pytest; extra == "all"
  Requires-Dist: pytest-cov; extra == "all"
@@ -112,7 +125,7 @@ Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: dev
- Requires-Dist: bfcl-eval; extra == "dev"
+ Requires-Dist: bfcl-eval==2025.6.16; extra == "dev"
  Requires-Dist: human-eval; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
  Requires-Dist: pytest-cov; extra == "dev"
@@ -175,9 +188,9 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
  - [📝 Introduction](#-introduction)
  - [☎ User Groups](#-user-groups)
  - [🎉 News](#-news)
- - [🛠️ Installation](#️-installation)
- - [Method 1: Install Using pip](#method-1-install-using-pip)
- - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🛠️ Environment Setup](#️-environment-setup)
+ - [Method 1. Install via pip](#method-1-install-via-pip)
+ - [Method 2. Install from source](#method-2-install-from-source)
  - [🚀 Quick Start](#-quick-start)
  - [Method 1. Using Command Line](#method-1-using-command-line)
  - [Method 2. Using Python Code](#method-2-using-python-code)
@@ -258,6 +271,13 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+
+ > [!IMPORTANT]
+ > **Version 1.0 Refactoring**
+ >
+ > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+ - 🔥 **[2025.08.22]** Version 1.0 Refactoring.
  - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
  - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
  - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
@@ -268,12 +288,12 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+ <details><summary>More</summary>
+
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
- <details><summary>More</summary>
-
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -306,58 +326,87 @@ Please scan the QR code below to join our community groups:

  </details>

- ## 🛠️ Installation
- ### Method 1: Install Using pip
- We recommend using conda to manage your environment and installing dependencies with pip:
+ ## 🛠️ Environment Setup
+
+ ### Method 1. Install via pip
+
+ We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.

  1. Create a conda environment (optional)
+ ```shell
+ # Python 3.10 is recommended
+ conda create -n evalscope python=3.10
+
+ # Activate the conda environment
+ conda activate evalscope
+ ```
+ 2. Install dependencies via pip
+ ```shell
+ pip install evalscope
+ ```
+ 3. Install additional dependencies (optional)
+ - To use model service inference benchmarking features, install the perf dependency:
  ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
+ pip install 'evalscope[perf]'
  ```
-
- 2. Install dependencies using pip
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install 'evalscope[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install 'evalscope[opencompass]'
+ pip install 'evalscope[vlmeval]'
+ pip install 'evalscope[rag]'
+ ```
+ - To install all dependencies:
  ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install 'evalscope[opencompass]' # Install OpenCompass backend
- pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
- pip install 'evalscope[rag]' # Install RAGEval backend
- pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
- pip install 'evalscope[app]' # Install dependencies for visualization
- pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ pip install 'evalscope[all]'
  ```

- > [!WARNING]
- > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
+ > [!NOTE]
+ > The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
  > ```shell
- > pip install llmuses<=0.4.3
+ > pip install llmuses<=0.4.3
  > ```
- > To import relevant dependencies using `llmuses`:
- > ``` python
+ > Then, import related dependencies using `llmuses`:
+ > ```python
  > from llmuses import ...
  > ```

- ### Method 2: Install from Source
- 1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ### Method 2. Install from source
+
+ Installing from source allows you to use the latest code and makes it easier for further development and debugging.

+ 1. Clone the source code
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[app]' # Install visualization dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+
+ pip install -e .
+ ```
+ 3. Install additional dependencies
+ - To use model service inference benchmarking features, install the perf dependency:
+ ```shell
+ pip install '.[perf]'
+ ```
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install '.[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install '.[opencompass]'
+ pip install '.[vlmeval]'
+ pip install '.[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install '.[all]'
+ ```


  ## 🚀 Quick Start
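The excerpt ends at the README's Quick Start heading. For orientation, a minimal sketch of the usual entry point; this assumes the long-documented `run_task`/`TaskConfig` API carries over to 1.0 (the diff modifies `evalscope/run.py` and `evalscope/config.py` but does not remove them), and the model and dataset names are placeholders:

```python
from evalscope import TaskConfig, run_task

# Hedged example: exact fields and defaults may differ in the 1.0 release.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # placeholder benchmark name
    limit=5,                             # evaluate only a few samples
)
run_task(task_cfg=task_cfg)
```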