evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
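
The bulk of this release replaces the old data-adapter and model-adapter machinery (evalscope/benchmarks/data_adapter.py, evalscope/models/adapters/*, evalscope/models/local_model.py, evalscope/models/register.py, all deleted below) with the new evalscope/api package (benchmark, dataset, evaluator, model, registry). The high-level entry point exercised by the deleted tests/cli/test_run.py shown further down is TaskConfig plus run_task; the following is a minimal sketch distilled from those tests, with an illustrative model id, dataset name, and sample limit (the exact 1.0.0 signature is not shown in this diff):

# Minimal sketch distilled from the deleted tests/cli/test_run.py shown below.
# The model id, dataset name, and limit are illustrative only.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',   # ModelScope model id or served model name
    datasets=['iquiz'],                   # registered benchmark names
    limit=10,                             # evaluate only the first 10 samples per dataset
)
run_task(task_cfg=task_cfg)
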
evalscope/models/local_model.py DELETED
@@ -1,128 +0,0 @@
- import importlib
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Optional
-
- from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType, ModelTask
- from evalscope.utils.logger import get_logger
- from evalscope.utils.model_utils import get_device
-
- if TYPE_CHECKING:
-     from evalscope.config import TaskConfig
-
- logger = get_logger()
-
-
- class LocalModel(ABC):
-
-     def __init__(self,
-                  model_id: str,
-                  model_revision: str = None,
-                  device_map: str = None,
-                  torch_dtype: str = 'auto',
-                  cache_dir: str = None,
-                  **kwargs):
-
-         self.model_id = model_id
-         self.model_revision = model_revision or DEFAULT_MODEL_REVISION
-         self.device = device_map or get_device()
-         self.cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-         self.kwargs = kwargs
-         self.model = None
-         self.tokenizer = None
-
-         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
-             import torch
-             torch_dtype = eval(torch_dtype)
-         self.torch_dtype = torch_dtype
-
-         self.model_cfg = {
-             'model_id': self.model_id,
-             'device_map': self.device,
-             'torch_dtype': str(self.torch_dtype),
-         }
-
-     @abstractmethod
-     def load_model(self):
-         pass
-
-
- class LocalChatModel(LocalModel):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load_model(self):
-         from modelscope import AutoModelForCausalLM, AutoTokenizer
-
-         logger.info(f'Loading model {self.model_id} ...')
-
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             trust_remote_code=True,
-             cache_dir=self.cache_dir,
-         )
-
-         # Fix no padding
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             device_map=self.device,
-             trust_remote_code=True,
-             torch_dtype=self.torch_dtype,
-             cache_dir=self.cache_dir,
-         )
-
-
- class LocalImageModel(LocalModel):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-         self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
-         # default to DiffusionPipeline if not specified
-         if self.pipeline_cls is None:
-             if 'flux' in self.model_id.lower():
-                 self.pipeline_cls = 'FluxPipeline'
-             else:
-                 self.pipeline_cls = 'DiffusionPipeline'
-
-     def load_model(self):
-         # from modelscope import pipeline_cls
-         module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
-
-         logger.info(f'Loading model {self.model_id} with {self.pipeline_cls} ...')
-
-         self.model = module.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             torch_dtype=self.torch_dtype,
-             cache_dir=self.cache_dir,
-             **self.kwargs,
-         )
-
-         self.model.to(self.device)
-
-     def __call__(self, *args, **kwargs):
-         return self.model(*args, **kwargs)
-
-
- def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
-     """Get the base local model for the task. If the task is not checkpoint-based, return None.
-     Avoids loading the model multiple times for different datasets.
-     """
-     if task_cfg.eval_type != EvalType.CHECKPOINT:
-         return None
-     elif task_cfg.model_task == ModelTask.TEXT_GENERATION:
-         base_model = LocalChatModel(model_id=task_cfg.model, **task_cfg.model_args)
-         base_model.load_model()
-         return base_model
-     elif task_cfg.model_task == ModelTask.IMAGE_GENERATION:
-         base_model = LocalImageModel(model_id=task_cfg.model, **task_cfg.model_args)
-         base_model.load_model()
-         return base_model
-     else:
-         raise ValueError(f'Unsupported model task: {task_cfg.model_task} for model checkpoint.')
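
For context, a sketch of how the removed get_local_model helper was driven. The field names follow the attributes the helper reads (eval_type, model_task, model, model_args); whether they are all constructor arguments of TaskConfig is an assumption here, and the model id is illustrative:

# Hypothetical call site for the removed helper above; not taken verbatim from the package.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, ModelTask

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    eval_type=EvalType.CHECKPOINT,          # only checkpoint tasks load a local model
    model_task=ModelTask.TEXT_GENERATION,   # selects LocalChatModel over LocalImageModel
    model_args={'torch_dtype': 'auto'},     # forwarded into LocalModel.__init__
)
base_model = get_local_model(task_cfg)      # loads the tokenizer and AutoModelForCausalLM once
# For non-checkpoint (API-served) tasks the helper returns None instead of loading weights.
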
evalscope/models/register.py DELETED
@@ -1,41 +0,0 @@
- MODEL_ADAPTERS = {}
-
-
- def register_model_adapter(name):
-     """
-     Decorator to register a model adapter with a given name.
-     :param name: The name of the model adapter.
-     """
-
-     def decorator(adapter):
-         if name in MODEL_ADAPTERS:
-             raise ValueError(f"Model adapter '{name}' is already registered.")
-         MODEL_ADAPTERS[name] = adapter
-         return adapter
-
-     return decorator
-
-
- def get_model_adapter(name):
-     """
-     Retrieve a registered model adapter by name.
-     :param name: The name of the model adapter.
-     :return: The model adapter class or function.
-     """
-     if name not in MODEL_ADAPTERS:
-         raise ValueError(
-             f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
-     return MODEL_ADAPTERS[name]
-
-
- def register_model_adapter_class(cls, name=None):
-     """
-     Register a model adapter class.
-     :param cls: The model adapter class to register
-     :param name: Optional name for the model adapter. If not provided, the class name will be used.
-     """
-     if name is None:
-         name = cls.__name__
-     if name in MODEL_ADAPTERS:
-         raise ValueError(f"Model adapter class '{name}' is already registered.")
-     MODEL_ADAPTERS[name] = cls
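
The registry above is a plain decorator-based lookup table; a short usage sketch follows (the adapter name and class here are hypothetical, not ones shipped by evalscope):

# Hypothetical adapter registered with the removed registry above.
@register_model_adapter('my_server_adapter')
class MyServerAdapter:
    def predict(self, prompts):
        ...

adapter_cls = get_model_adapter('my_server_adapter')   # -> MyServerAdapter
register_model_adapter_class(MyServerAdapter)          # registered again under 'MyServerAdapter'
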
tests/cli/test_run.py DELETED
@@ -1,489 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- from tests.utils import test_level_list
-
- env = dotenv_values('.env')
-
- import os
- import subprocess
- import unittest
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
-
- os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
- logger = get_logger()
-
-
- class TestRun(unittest.TestCase):
-
-     def setUp(self) -> None:
-         logger.info('Init env for evalscope native run UTs ...\n')
-         self._check_env('evalscope')
-
-     def tearDown(self) -> None:
-         pass
-
-     @staticmethod
-     def _check_env(module_name: str):
-         if is_module_installed(module_name):
-             logger.info(f'{module_name} is installed.')
-         else:
-             raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_simple_eval(self):
-         model = 'qwen/Qwen2-0.5B-Instruct'
-         datasets = 'arc'  # arc ceval
-         limit = 10
-
-         cmd_simple = f'evalscope eval ' \
-                      f'--model {model} ' \
-                      f'--datasets {datasets} ' \
-                      f'--limit {limit}'
-
-         logger.info(f'Start to run command: {cmd_simple}')
-         run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-         assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
-         logger.info(f'>>test_run_simple_eval stdout: {run_res.stdout}')
-         logger.error(f'>>test_run_simple_eval stderr: {run_res.stderr}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_eval_with_args(self):
-         model = 'qwen/Qwen2-0.5B-Instruct'
-         datasets = 'arc'  # arc ceval
-         limit = 5
-         dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'
-
-         cmd_with_args = f'evalscope eval ' \
-                         f'--model {model} ' \
-                         f'--datasets {datasets} ' \
-                         f'--limit {limit} ' \
-                         f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
-                         f"""--dataset-args \'{dataset_args}\' """
-
-         logger.info(f'Start to run command: {cmd_with_args}')
-         run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-         assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
-         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
-         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_yaml_config(self):
-         from evalscope import run_task
-
-         run_task(task_cfg='examples/tasks/eval_native.yaml')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task(self):
-         task_cfg = TaskConfig(
-             model='qwen/Qwen2.5-0.5B-Instruct',
-             datasets=[
-                 'iquiz',
-                 # 'ifeval',
-                 # 'mmlu',
-                 # 'mmlu_pro',
-                 # 'musr',
-                 # 'process_bench',
-                 # 'race',
-                 # 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'super_gpqa',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'math_500',
-                 'aime24',
-                 'gpqa',
-                 # 'arc',
-                 # 'ceval',
-                 # 'hellaswag',
-                 # 'general_mcq',
-                 # 'general_qa'
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_pro': {
-                     'subset_list': ['math', 'health'],
-                     'few_shot_num': 4
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'gpqa': {
-                     'subset_list': ['gpqa_diamond'],
-                     'few_shot_num': 0
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 1']
-                 },
-                 'process_bench': {
-                     'subset_list': ['gsm8k'],
-                 },
-                 'musr': {
-                     'subset_list': ['murder_mysteries'],
-                 },
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation dataset name, i.e. the * in the *_dev.csv above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, i.e. the * in the *_dev.csv above
-                         # 'test'
-                     ],
-                     'metric_list': ['AverageBLEU']
-                 },
-                 'super_gpqa': {
-                     'subset_list': ['Philosophy', 'Education'],
-                     'few_shot_num': 0
-                 },
-                 'ifeval': {
-                     'filters': {
-                         'remove_until': '</think>'
-                     }
-                 }
-             },
-             limit=2,
-             eval_batch_size=2,
-             generation_config={
-                 'max_new_tokens': 2048,
-                 'temperature': 0.7,
-                 'num_return_sequences': 1,
-             },
-             # debug=True
-         )
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_one_task(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='Qwen/Qwen3-1.7B',
-             datasets=[
-                 # 'iquiz',
-                 # 'math_500',
-                 # 'aime24',
-                 # 'competition_math',
-                 # 'mmlu',
-                 # 'simple_qa',
-                 'truthful_qa',
-             ],
-             dataset_args={
-                 'competition_math': {
-                     'subset_list': ['Level 4', 'Level 5']
-                 },
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0
-                 },
-             },
-             limit=5,
-             eval_batch_size=5,
-             generation_config={
-                 'max_new_tokens': 1000,  # max new tokens; use a large value to avoid truncated output
-                 'temperature': 0.7,  # sampling temperature (value recommended in the Qwen report)
-                 'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
-                 'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
-                 'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-             },
-             judge_strategy=JudgeStrategy.AUTO,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task_loop(self):
-         os.environ['CUDA_VISIBLE_DEVICES'] = '2'
-         from evalscope.config import TaskConfig
-
-         task_cfg1 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model1',
-             datasets=['iquiz'],
-             limit=10
-         )
-         task_cfg2 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model2',
-             datasets=['iquiz'],
-             limit=10
-         )
-         task_cfg3 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model3',
-             datasets=['iquiz'],
-             limit=10
-         )
-
-         run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_server_model(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'iquiz',
-                 # 'ifeval',
-                 # 'mmlu',
-                 # 'mmlu_pro',
-                 # 'musr',
-                 # 'process_bench',
-                 # 'race',
-                 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'math_500',
-                 # 'aime24',
-                 # 'gpqa',
-                 # 'arc',
-                 # 'ceval',
-                 # 'hellaswag',
-                 # 'general_mcq',
-                 # 'general_qa',
-                 # 'super_gpqa',
-                 # 'mmlu_redux',
-                 # 'maritime_bench',
-                 # 'drop',
-                 # 'winogrande',
-                 # 'tool_bench',
-                 # 'frames',
-                 # 'bfcl_v3',
-                 # 'truthful_qa',
-                 # 'tau_bench',
-                 # 'hle'
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_pro': {
-                     'subset_list': ['math', 'health'],
-                     'few_shot_num': 0
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'gpqa': {
-                     # 'subset_list': ['gpqa_diamond'],
-                     'few_shot_num': 0,
-                     'local_path': './data/data/gpqa',
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 1']
-                 },
-                 'process_bench': {
-                     'subset_list': ['gsm8k'],
-                 },
-                 'musr': {
-                     'subset_list': ['murder_mysteries'],
-                 },
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation dataset name, i.e. the * in the *_dev.csv above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, i.e. the * in the *_dev.csv above
-                         # 'test'
-                     ],
-                     'metric_list': ['AverageRouge']
-                 },
-                 'super_gpqa': {
-                     'subset_list': ['Philosophy', 'Education'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_redux': {
-                     'subset_list': ['abstract_algebra']
-                 },
-                 'frames': {
-                     'local_path': 'data/iic/frames',
-                 },
-                 'bfcl_v3': {
-                     'subset_list': ['parallel'],
-                     'extra_params': {
-                         # 'is_fc_model': False,
-                     }
-                 },
-                 'tau_bench': {
-                     'extra_params': {
-                         'user_model': 'qwen-plus',
-                         'api_key': env.get('DASHSCOPE_API_KEY'),
-                         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                     }
-                 },
-                 'hle': {
-                     'subset_list': ['Math', 'Other'],
-                 },
-             },
-             eval_batch_size=10,
-             limit=10,
-             # debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0.6,
-                 'n': 1,
-                 'max_tokens': 4096,
-                 # 'extra_headers':{'key': 'value'},
-             },
-             ignore_errors=False,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_judge_model(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'math_500',
-                 # 'aime24',
-                 # 'competition_math',
-                 # 'arc',
-                 # 'gsm8k',
-                 # 'truthful_qa',
-                 # 'simple_qa',
-                 # 'chinese_simpleqa',
-                 # 'live_code_bench',
-                 # 'humaneval',
-                 # 'general_qa',
-                 # 'alpaca_eval',
-                 # 'arena_hard',
-                 # 'frames',
-                 # 'docmath',
-                 # 'needle_haystack',
-                 # 'ifeval',
-                 'hle'
-             ],
-             dataset_args={
-                 'needle_haystack': {
-                     'subset_list': ['english'],
-                     'extra_params': {
-                         'show_score': True,
-                     }
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 4']
-                 },
-                 'live_code_bench': {
-                     'extra_params': {
-                         'start_date': '2024-08-01',
-                         'end_date': '2025-02-28'
-                     },
-                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, i.e. the * in the *_dev.csv above
-                         # 'test'
-                     ]
-                 },
-                 'chinese_simpleqa': {
-                     'subset_list': [
-                         '中华文化'
-                     ]
-                 },
-                 'frames': {
-                     'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-                 },
-                 'hle': {
-                     'subset_list': ['Math', 'Other'],
-                 },
-             },
-             eval_batch_size=10,
-             limit=3,
-             judge_strategy=JudgeStrategy.LLM,
-             judge_worker_num=5,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 }
-             },
-             generation_config={
-                 'max_new_tokens': 20000,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'n': 1
-             },
-             timeout=60000,
-             stream=True,
-             use_cache='outputs/20250714_150626'
-             # analysis_report=True,
-             # debug=True,
-             # use_cache='outputs/20250616_161931'
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
- if __name__ == '__main__':
-     unittest.main()