evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -1,69 +1,127 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import os
  import copy
- from dataclasses import dataclass, asdict, field
- from typing import Optional, List
+ import json
+ import os
+ from argparse import Namespace
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Union

- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
  from evalscope.models.custom import CustomModel
- from evalscope.utils import yaml_to_dict
+ from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()

  cur_path = os.path.dirname(os.path.abspath(__file__))

- registry_tasks = {
-     'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-     'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-     'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-     'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
-     'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-     'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-     'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
-
-     # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-     # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-     # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-
+ DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+ DEFAULT_GENERATION_CONFIG = {
+     'max_length': 2048,
+     'max_new_tokens': 512,
+     'do_sample': False,
+     'top_k': 50,
+     'top_p': 1.0,
+     'temperature': 1.0,
  }


  @dataclass
  class TaskConfig:
-     model_args: Optional[dict] = field(default_factory=dict)
-     template_type: Optional[str] = 'default-generation'
-     generation_config: Optional[dict] = field(default_factory=dict)
-     dataset_args: Optional[dict] = field(default_factory=dict)
+     # Model-related arguments
+     model: Union[str, CustomModel, None] = None
+     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+
+     # Template-related arguments
+     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+     chat_template: Optional[str] = None
+
+     # Dataset-related arguments
+     datasets: Optional[List[str]] = None
+     dataset_args: Optional[Dict] = field(default_factory=dict)
+     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+     dataset_hub: str = HubType.MODELSCOPE
+
+     # Generation configuration arguments
+     generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+
+     # Evaluation-related arguments
+     eval_type: str = EvalType.CHECKPOINT
+     eval_backend: str = EvalBackend.NATIVE
+     eval_config: Union[str, Dict, None] = None
+     stage: str = EvalStage.ALL
+     limit: Optional[int] = None
+
+     # Cache and working directory arguments
+     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
+     use_cache: Optional[str] = None
+     work_dir: str = DEFAULT_WORK_DIR
+     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+
+     # Debug and runtime mode arguments
+     debug: bool = False
      dry_run: bool = False
-     model: CustomModel = None
-     eval_type: str = 'custom'
-     datasets: list = field(default_factory=list)
-     work_dir: str = DEFAULT_ROOT_CACHE_DIR
-     outputs: str = None
-     mem_cache: bool = False
-     use_cache: bool = True
-     stage: str = 'all'  # `all` or `infer` or `review`
-     dataset_hub: str = 'ModelScope'
-     dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
-     limit: int = None
-     eval_backend: str = 'Native'
-     eval_config: dict = field(default_factory=dict)
-
-     # def __post_init__(self):
-     #     self.registry_tasks = {
-     #         'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-     #         'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-     #         'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-     #         'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-     #         'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-     #
-     #         'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-     #         'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-     #         'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-     #
-     #     }
+     seed: int = 42
+
+     def to_dict(self):
+         # Note: to avoid serialization error for some model instance
+         return self.__dict__
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+     def update(self, other: Union['TaskConfig', dict]):
+         if isinstance(other, TaskConfig):
+             other = other.to_dict()
+         self.__dict__.update(other)
+
+     def dump_yaml(self, output_dir: str):
+         """Dump the task configuration to a YAML file."""
+         task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
+         try:
+             logger.info(f'Dump task config to {task_cfg_file}')
+             dict_to_yaml(self.to_dict(), task_cfg_file)
+         except Exception as e:
+             logger.warning(f'Failed to dump overall task config: {e}')
+
+     @staticmethod
+     def list():
+         return list(registry_tasks.keys())
+
+     @staticmethod
+     def from_yaml(yaml_file: str):
+         return TaskConfig.from_dict(yaml_to_dict(yaml_file))
+
+     @staticmethod
+     def from_dict(d: dict):
+         return TaskConfig(**d)
+
+     @staticmethod
+     def from_json(json_file: str):
+         return TaskConfig.from_dict(json_to_dict(json_file))
+
+     @staticmethod
+     def from_args(args: Namespace):
+         # Convert Namespace to a dictionary and filter out None values
+         args_dict = {k: v for k, v in vars(args).items() if v is not None}
+         del args_dict['func']  # Note: compat CLI arguments
+
+         return TaskConfig.from_dict(args_dict)
+
+     @staticmethod
+     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+         res_list = []
+         for task_name in tasks:
+             task = registry_tasks.get(task_name, None)
+             if task is None:
+                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                 continue
+
+             task.model = custom_model
+             res_list.append(task)
+
+         return res_list

      @staticmethod
      def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +133,7 @@ class TaskConfig:
              data_pattern: str, the data pattern for the task.
                  e.g. `mmlu`, `ceval`, `gsm8k`, ...
                  refer to task_config.list() for all available datasets.
-             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
                  then your specific custom dataset directory will be /path/to/data/{name}
              subset_list: list, the subset list for the dataset.
                  e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +141,31 @@ class TaskConfig:
          """
          available_datasets = list(registry_tasks.keys())
          if data_pattern not in available_datasets:
-             logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
+             logger.error(
+                 f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
              return

          # Reuse the existing task config and update the datasets
-         pattern_config = registry_tasks.get(data_pattern)
+         pattern_config = registry_tasks[data_pattern]

          custom_config = copy.deepcopy(pattern_config)
-         custom_config.update({'datasets': [data_pattern]})
-         custom_config.update({'dataset_hub': 'Local'})  # TODO: to support `ModelScope`
-         if 'dataset_args' in custom_config:
-             if data_pattern not in custom_config:
-                 custom_config['dataset_args'].update({data_pattern: {}})
-         else:
-             custom_config.update({'dataset_args': {data_pattern: {}}})
+         custom_config.datasets = [data_pattern]
+         custom_config.dataset_args = {data_pattern: {}}
+         custom_config.eval_type = EvalType.CHECKPOINT

          if dataset_dir is not None:
-             custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
+             custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})

          if subset_list is not None:
-             # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
-             custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+             custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})

          registry_tasks.update({name: custom_config})
          logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')

-     def to_dict(self):
-         # Note: to avoid serialization error for some model instance
-         _tmp_model = copy.copy(self.model)
-         self.model = None
-         res_dict = asdict(self)
-         res_dict.update({'model': _tmp_model})
-         self.model = _tmp_model

-         return res_dict
+ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']

-     @staticmethod
-     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-         res_list = []
-         for task_name in tasks:
-             task: dict = registry_tasks.get(task_name, None)
-             if task is None:
-                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                 continue
-
-             res = TaskConfig(**task)
-             res.model = custom_model
-             if res.outputs is None:
-                 res.outputs = os.path.join(res.work_dir,
-                                            'outputs',
-                                            f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
-             res_list.append(res)
-
-         return res_list
-
-     @staticmethod
-     def list():
-         return list(registry_tasks.keys())
+ registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}


  class TempModel(CustomModel):
@@ -158,9 +184,7 @@ if __name__ == '__main__':
      # Register a new task
      TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')

-     import json
      swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
      for item in swift_eval_task:
-         print(item.to_dict())
+         print(item)
          print()
-
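
For orientation, a minimal usage sketch of the reworked TaskConfig follows, using only methods visible in the diff above; the model id, dataset choice, and output path are illustrative assumptions, not part of the release.

    # Sketch only: exercising the new TaskConfig API shown in the diff above.
    from evalscope.config import TaskConfig

    # Build a config from a plain dict; unspecified fields fall back to the new
    # defaults (DEFAULT_MODEL_ARGS, DEFAULT_GENERATION_CONFIG, EvalType.CHECKPOINT, ...).
    cfg = TaskConfig.from_dict({
        'model': 'qwen/Qwen-7B-Chat',  # illustrative model id
        'datasets': ['gsm8k'],
        'limit': 10,
    })

    print(TaskConfig.list())    # registered task names: arc, gsm8k, mmlu, cmmlu, ceval, bbh, general_qa
    print(cfg)                  # __str__ now renders the config as pretty-printed JSON
    cfg.dump_yaml('./outputs')  # writes task_config_<hash>.yaml via gen_hash and dict_to_yaml

Note that registry_tasks now maps each task name to a full TaskConfig instance built from the same registry YAML files, rather than a raw dict, which is why load() can assign task.model directly.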
evalscope/constants.py CHANGED
@@ -1,7 +1,18 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from enum import Enum
+ import os
+ from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
+ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

- DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
+ DEFAULT_WORK_DIR = './outputs'
+ DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
+ DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
+ DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+
+
+ class HubType:
+     MODELSCOPE = 'modelscope'
+     HUGGINGFACE = 'huggingface'
+     LOCAL = 'local'


  class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
      ]


- class MetricMembers(Enum):
+ class MetricMembers:

      # Math accuracy metric
      MATH_ACCURACY = 'math_accuracy'
@@ -66,53 +77,51 @@ class ArenaMode:


  class OutputsStructure:
-
-     LOGS_DIR = 'logs_dir'
-
-     PREDICTIONS_DIR = 'predictions_dir'
-
-     REVIEWS_DIR = 'reviews_dir'
-
-     REPORTS_DIR = 'reports_dir'
-
-     CONFIGS_DIR = 'configs_dir'
+     LOGS_DIR = 'logs'
+     PREDICTIONS_DIR = 'predictions'
+     REVIEWS_DIR = 'reviews'
+     REPORTS_DIR = 'reports'
+     CONFIGS_DIR = 'configs'
+
+     def __init__(self, outputs_dir: str, is_make: bool = True):
+         self.outputs_dir = outputs_dir
+         self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
+         self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
+         self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
+         self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
+         self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
+
+         if is_make:
+             self.create_directories()
+
+     def create_directories(self):
+         os.makedirs(self.outputs_dir, exist_ok=True)
+         os.makedirs(self.logs_dir, exist_ok=True)
+         os.makedirs(self.predictions_dir, exist_ok=True)
+         os.makedirs(self.reviews_dir, exist_ok=True)
+         os.makedirs(self.reports_dir, exist_ok=True)
+         os.makedirs(self.configs_dir, exist_ok=True)


  class AnswerKeys:
-
      ANSWER_ID = 'answer_id'
-
      RAW_INPUT = 'raw_input'
-
      ORIGIN_PROMPT = 'origin_prompt'
-
      MODEL_SPEC = 'model_spec'
-
      SUBSET_NAME = 'subset_name'
-
      CHOICES = 'choices'


  class ReviewKeys:
-
      REVIEW_ID = 'review_id'
-
      REVIEWED = 'reviewed'
-
      REVIEWER_SPEC = 'reviewer_spec'
-
      REVIEW_TIME = 'review_time'
-
      MESSAGE = 'message'
-
      CONTENT = 'content'
-
      GOLD = 'gold'
-
      PRED = 'pred'
-
      RESULT = 'result'
-
      REVIEW = 'review'


@@ -148,3 +157,26 @@ class EvalStage:
      ALL = 'all'
      INFER = 'infer'
      REVIEW = 'review'
+
+
+ class EvalType:
+
+     CUSTOM = 'custom'
+     CHECKPOINT = 'checkpoint'
+
+
+ class EvalBackend:
+     # Use native evaluation pipeline of EvalScope
+     NATIVE = 'Native'
+
+     # Use OpenCompass framework as the evaluation backend
+     OPEN_COMPASS = 'OpenCompass'
+
+     # Use VLM Eval Kit as the multi-modal model evaluation backend
+     VLM_EVAL_KIT = 'VLMEvalKit'
+
+     # Use RAGEval as the RAG evaluation backend
+     RAG_EVAL = 'RAGEval'
+
+     # Use third-party evaluation backend/modules
+     THIRD_PARTY = 'ThirdParty'
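
As a quick illustration of the reworked OutputsStructure, a sketch follows using only attributes visible in the diff above; the output paths are illustrative assumptions.

    # Sketch only: the OutputsStructure helper from the diff above.
    from evalscope.constants import OutputsStructure

    # Instantiation computes logs/, predictions/, reviews/, reports/ and configs/
    # under the given root, and creates them eagerly since is_make defaults to True.
    outputs = OutputsStructure('./outputs/demo')  # illustrative path
    print(outputs.reports_dir)  # ./outputs/demo/reports

    # Pass is_make=False to compute the paths without touching the filesystem.
    paths_only = OutputsStructure('./outputs/dry-run', is_make=False)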
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  from evalscope.evaluator.evaluator import Evaluator
+ from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator