evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -1,16 +1,25 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# flake8: noqa: E501
 import copy
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
-                                 JudgeStrategy, ModelTask, OutputType)
-from evalscope.models import CustomModel, DummyCustomModel
+from evalscope.api.model import GenerateConfig
+from evalscope.constants import (
+    DEFAULT_DATASET_CACHE_DIR,
+    DEFAULT_WORK_DIR,
+    EvalBackend,
+    EvalType,
+    HubType,
+    JudgeStrategy,
+    ModelTask,
+    OutputType,
+)
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
-from evalscope.utils.io_utils import dict_to_yaml, gen_hash
+from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -19,13 +28,12 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Union[str, 'CustomModel', None] = None
+    model: Optional[str] = None
     model_id: Optional[str] = None
     model_args: Dict = field(default_factory=dict)
     model_task: str = ModelTask.TEXT_GENERATION
 
     # Template-related arguments
-    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
     chat_template: Optional[str] = None
 
     # Dataset-related arguments
@@ -33,23 +41,22 @@ class TaskConfig(BaseArgument):
     dataset_args: Dict = field(default_factory=dict)
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
     dataset_hub: str = HubType.MODELSCOPE
+    repeats: int = 1  # Number of times to repeat the dataset items for k-metrics
 
     # Generation configuration arguments
-    generation_config: Dict = field(default_factory=dict)
+    generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
     eval_backend: str = EvalBackend.NATIVE
     eval_config: Union[str, Dict, None] = None
-    stage: str = EvalStage.ALL
     limit: Optional[Union[int, float]] = None
-    eval_batch_size: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
-    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
     use_cache: Optional[str] = None
+    rerun_review: bool = False
     work_dir: str = DEFAULT_WORK_DIR
-    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
@@ -59,7 +66,7 @@
     api_url: Optional[str] = None  # Only used for server model
     api_key: Optional[str] = 'EMPTY'  # Only used for server model
     timeout: Optional[float] = None  # Only used for server model
-    stream: bool = False  # Only used for server model
+    stream: Optional[bool] = None  # Only used for server model
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
@@ -68,55 +75,87 @@
     analysis_report: bool = False
 
     def __post_init__(self):
+        self.__init_model_and_id()
+
+        self.__init_eval_data_config()
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_model_and_id(self):
+        # Set model to DummyCustomModel if not provided
         if self.model is None:
-            self.model = DummyCustomModel()
-            self.eval_type = EvalType.CUSTOM
+            self.model = self.model_task
+            self.eval_type = EvalType.MOCK_LLM
+        else:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
+                self.eval_type = EvalType.TEXT2IMAGE
 
-        if (not self.model_id) and self.model:
-            if isinstance(self.model, CustomModel):
-                self.model_id = self.model.config.get('model_id', 'custom_model')
+        # Set model_id if not provided
+        if not self.model_id:
+            if self.model:
+                self.model_id = safe_filename(os.path.basename(self.model))
             else:
-                self.model_id = os.path.basename(self.model).rstrip(os.sep)
-                # fix path error, see http://github.com/modelscope/evalscope/issues/377
-                self.model_id = self.model_id.replace(':', '-')
-
-        # Set default eval_batch_size based on eval_type
-        if self.eval_batch_size is None:
-            self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
+                self.model_id = 'dummy_model'
 
+    def __init_eval_data_config(self):
         # Post process limit
         if self.limit is not None:
             self.limit = parse_int_or_float(self.limit)
 
-        # Set default generation_config and model_args
-        self.__init_default_generation_config()
-        self.__init_default_model_args()
-
     def __init_default_generation_config(self):
-        if self.generation_config:
-            return
-        if self.model_task == ModelTask.IMAGE_GENERATION:
-            self.generation_config = {
-                'height': 1024,
-                'width': 1024,
-                'num_inference_steps': 50,
-                'guidance_scale': 9.0,
-            }
-        elif self.model_task == ModelTask.TEXT_GENERATION:
-            if self.eval_type == EvalType.CHECKPOINT:
-                self.generation_config = {
-                    'max_length': 2048,
-                    'max_new_tokens': 512,
-                    'do_sample': False,
-                    'top_k': 50,
-                    'top_p': 1.0,
-                    'temperature': 1.0,
-                }
-            elif self.eval_type == EvalType.SERVICE:
+        if not self.generation_config:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
                 self.generation_config = {
-                    'max_tokens': 2048,
-                    'temperature': 0.0,
+                    'height': 1024,
+                    'width': 1024,
+                    'num_inference_steps': 50,
+                    'guidance_scale': 9.0,
                 }
+            elif self.model_task == ModelTask.TEXT_GENERATION:
+                if self.eval_type == EvalType.CHECKPOINT:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'do_sample': False,
+                        'top_k': 50,
+                        'top_p': 1.0,
+                        'temperature': 1.0,
+                        'n': 1,
+                    }
+                elif self.eval_type == EvalType.SERVICE:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'temperature': 0.0,
+                    }
+        if isinstance(self.generation_config, dict):
+            self.generation_config = GenerateConfig.model_validate(self.generation_config)
+
+        # Set eval_batch_size to generation_config.batch_size
+        self.generation_config.batch_size = self.eval_batch_size
+
+        # Set default values for generation_config
+        if self.timeout is not None:
+            deprecated_warning(
+                logger,
+                'The `timeout` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.timeout` instead.'
+            )
+            self.generation_config.timeout = self.timeout
+
+        if self.stream is not None:
+            deprecated_warning(
+                logger,
+                'The `stream` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.stream` instead.'
+            )
+            self.generation_config.stream = self.stream
+
+        if self.generation_config.n is not None and self.generation_config.n > 1:
+            self.repeats = self.generation_config.n
+            self.generation_config.n = 1
+            deprecated_warning(
+                logger,
+                'The `n` parameter in generation_config is deprecated and will be removed in v1.1.0. Use `TaskConfig.repeats` instead.'
+            )
 
     def __init_default_model_args(self):
         if self.model_args:
@@ -143,9 +182,11 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
+        result = copy.deepcopy(self.__dict__)
+        del result['api_key']  # Do not expose api_key in the config
+
+        if isinstance(self.generation_config, GenerateConfig):
+            result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
 
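Taken together, the config.py changes funnel all generation settings into a single pydantic GenerateConfig during __post_init__: a plain dict is validated via GenerateConfig.model_validate, eval_batch_size is mirrored into generation_config.batch_size, and the old top-level timeout/stream flags and generation_config['n'] are deprecated in favor of generation_config.timeout, generation_config.stream, and TaskConfig.repeats. A minimal usage sketch against the 1.0.0 code above; the model id is a placeholder, and the sketch assumes GenerateConfig accepts the keys shown (the defaults in the diff suggest it does):

from evalscope.config import TaskConfig

task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id for illustration
    eval_type='openai_api',            # EvalType.SERVICE; the string was 'service' in 0.17.1
    generation_config={
        'max_tokens': 2048,
        'temperature': 0.0,
        'stream': True,    # replaces the deprecated top-level `stream` flag
        'timeout': 60.0,   # replaces the deprecated top-level `timeout` flag
    },
    repeats=4,  # replaces generation_config['n'] for k-metrics
)

# __post_init__ validates the dict into a GenerateConfig and mirrors
# eval_batch_size (default 1) into generation_config.batch_size.
assert type(task.generation_config).__name__ == 'GenerateConfig'
assert task.generation_config.batch_size == task.eval_batch_size == 1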
evalscope/constants.py CHANGED
@@ -9,9 +9,12 @@ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_
 
 DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
-DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
-DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub/models
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/hub/datasets
 DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
+    os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
+)  # ~/.cache/evalscope
 
 
 class HubType:
@@ -44,22 +47,12 @@ class MetricsConstant:
 class ArenaWinner:
 
     MODEL_A = 'model_a'
-
     MODEL_B = 'model_b'
-
     TIE = 'tie'
-
     TIE_BOTH_BAD = 'tie_both_bad'
-
     UNKNOWN = 'unknown'
 
 
-class ArenaMode:
-    SINGLE = 'single'
-    PAIRWISE = 'pairwise'
-    PAIRWISE_BASELINE = 'pairwise_baseline'
-
-
 class AnswerKeys:
     INDEX = 'index'
     ANSWER_ID = 'answer_id'
@@ -70,58 +63,13 @@ class AnswerKeys:
     CHOICES = 'choices'
 
 
-class ReviewKeys:
-    REVIEW_ID = 'review_id'
-    REVIEWED = 'reviewed'
-    REVIEWER_SPEC = 'reviewer_spec'
-    REVIEW_TIME = 'review_time'
-    MESSAGE = 'message'
-    CONTENT = 'content'
-    GOLD = 'gold'
-    PRED = 'pred'
-    RESULT = 'result'
-    REVIEW = 'review'
-
-
-class EvalConfigKeys:
-    CLASS_REF = 'ref'
-    CLASS_ARGS = 'args'
-    ENABLE = 'enable'
-    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
-    RANDOM_SEED = 'random_seed'
-    FN_COMPLETION_PARSER = 'fn_completion_parser'
-    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
-    OUTPUT_FILE = 'output_file'
-    MODEL_ID_OR_PATH = 'model_id_or_path'
-    MODEL_REVISION = 'revision'
-    GENERATION_CONFIG = 'generation_config'
-    PRECISION = 'precision'
-    TEMPLATE_TYPE = 'template_type'
-
-
-class FnCompletionParser:
-    LMSYS_PARSER: str = 'lmsys_parser'
-    RANKING_PARSER: str = 'ranking_parser'
-
-
-class PositionBiasMitigation:
-    NONE = 'none'
-    RANDOMIZE_ORDER = 'randomize_order'
-    SWAP_POSITION = 'swap_position'
-
-
-class EvalStage:
-    # Enums: `all`, `infer`, `review`
-    ALL = 'all'
-    INFER = 'infer'
-    REVIEW = 'review'
-
-
 class EvalType:
 
     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'  # native model checkpoint
-    SERVICE = 'service'  # model service
+    MOCK_LLM = 'mock_llm'
+    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
+    SERVICE = 'openai_api'  # model service
+    TEXT2IMAGE = 'text2image'  # image generation service
 
 
 class OutputType:
@@ -142,6 +90,7 @@ class EvalBackend:
 
 class DataCollection:
     NAME = 'data_collection'
+    INFO = 'collection_info'
 
 
 class JudgeStrategy:
@@ -159,3 +108,22 @@ class JudgeScoreType:
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
+
+
+class Tags:
+    KNOWLEDGE = 'Knowledge'
+    MULTIPLE_CHOICE = 'MCQ'
+    MATH = 'Math'
+    REASONING = 'Reasoning'
+    CODING = 'Coding'
+    CHINESE = 'Chinese'
+    COMMONSENSE = 'Commonsense'
+    QA = 'QA'
+    READING_COMPREHENSION = 'ReadingComprehension'
+    CUSTOM = 'Custom'
+    INSTRUCTION_FOLLOWING = 'InstructionFollowing'
+    ARENA = 'Arena'
+    LONG_CONTEXT = 'LongContext'
+    RETRIEVAL = 'Retrieval'
+    FUNCTION_CALLING = 'FunctionCalling'
+    TEXT_TO_IMAGE = 'TextToImage'
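Note that the EvalType renames change the string values, not just the attribute names, so serialized configs carrying 'checkpoint' or 'service' must be updated, and the new evalscope cache root honors an EVALSCOPE_CACHE environment variable that is read once at import time. A small sketch (the override path is illustrative):

import os

# Must be set before evalscope.constants is imported, because the
# default is computed at module import time via os.getenv.
os.environ['EVALSCOPE_CACHE'] = '/tmp/evalscope-cache'  # illustrative path

from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, EvalType

assert EvalType.CHECKPOINT == 'llm_ckpt'   # was 'checkpoint' in 0.17.1
assert EvalType.SERVICE == 'openai_api'    # was 'service' in 0.17.1
print(DEFAULT_EVALSCOPE_CACHE_DIR)         # /tmp/evalscope-cache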
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from .evaluator import Evaluator
+from .evaluator import DefaultEvaluator
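For downstream code the rename is a one-line import change (sketch; assumes Evaluator was the only symbol imported from this module):

# evalscope 0.17.1
# from evalscope.evaluator import Evaluator

# evalscope 1.0.0
from evalscope.evaluator import DefaultEvaluator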