evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (234) hide show
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,67 +1,56 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
- import os, sys, time
3
- from argparse import ArgumentParser
2
+ import os
4
3
  import subprocess
5
-
4
+ import sys
5
+ import time
6
+ from argparse import ArgumentParser
6
7
 
7
8
  from evalscope.cli.base import CLICommand
8
9
 
9
-
10
10
  current_path = os.path.dirname(os.path.abspath(__file__))
11
11
  print(current_path)
12
12
  root_path = os.path.dirname(current_path)
13
13
  print(root_path)
14
14
 
15
+
15
16
  def subparser_func(args):
16
17
  """ Function which will be called for a specific sub parser.
17
18
  """
18
19
  return PerfServerCMD(args)
19
20
 
21
+
20
22
  def add_perf_args(parser):
23
+ parser.add_argument('--server-command', required=True, type=str, help='The start server command.')
21
24
  parser.add_argument(
22
- '--server-command', required=True, type=str, help='The start server command.')
23
- parser.add_argument(
24
- '--logdir', required=True, type=str, help='The monitor log save dir, tensorboard start at this path for display!')
25
- parser.add_argument(
26
- '--host', type=str, default='0.0.0.0', help='The tensorboard host'
27
- )
28
- parser.add_argument(
29
- '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
30
- )
25
+ '--logdir',
26
+ required=True,
27
+ type=str,
28
+ help='The monitor log save dir, tensorboard start at this path for display!')
29
+ parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
30
+ parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
31
+
31
32
 
32
33
  def async_run_command_with_popen(cmd):
33
34
  sub_process = subprocess.Popen(
34
- cmd,
35
- stdout=subprocess.PIPE,
36
- stderr=subprocess.STDOUT,
37
- bufsize=1,
38
- universal_newlines=True,
39
- encoding='utf8')
35
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
40
36
  return sub_process
41
37
 
38
+
42
39
  def start_monitor(args):
43
- cmd = ['python',
44
- '%s/perf/monitor.py'%root_path,
45
- '--logdir',
46
- args.logdir]
40
+ cmd = ['python', '%s/perf/monitor.py' % root_path, '--logdir', args.logdir]
47
41
  print(cmd)
48
42
  p = async_run_command_with_popen(cmd)
49
43
  os.set_blocking(p.stdout.fileno(), False)
50
44
  return p
51
45
 
46
+
52
47
  def start_tensorboard(args):
53
- cmd = ['tensorboard',
54
- '--logdir',
55
- args.logdir,
56
- '--host',
57
- args.host,
58
- '--port',
59
- args.tensorboard_port
60
- ]
48
+ cmd = ['tensorboard', '--logdir', args.logdir, '--host', args.host, '--port', args.tensorboard_port]
61
49
  p = async_run_command_with_popen(cmd)
62
50
  os.set_blocking(p.stdout.fileno(), False)
63
51
  return p
64
52
 
53
+
65
54
  def start_server(args):
66
55
  cmd = args.server_command
67
56
  print(cmd)
@@ -76,7 +65,7 @@ def start_server(args):
76
65
 
77
66
  os.set_blocking(sub_process.stdout.fileno(), False)
78
67
  return sub_process
79
-
68
+
80
69
 
81
70
  def wait_for_workers(workers):
82
71
  while True:
@@ -91,12 +80,12 @@ def wait_for_workers(workers):
91
80
  else:
92
81
  break
93
82
  else:
94
- print('Worker %s completed!'%idx)
83
+ print('Worker %s completed!' % idx)
95
84
  for line in iter(worker.stdout.readline, ''):
96
85
  if line != '':
97
86
  sys.stdout.write(line)
98
87
  else:
99
- break
88
+ break
100
89
  workers[idx] = None
101
90
 
102
91
  is_all_completed = True
@@ -108,7 +97,8 @@ def wait_for_workers(workers):
108
97
  if is_all_completed:
109
98
  break
110
99
  time.sleep(0.1)
111
-
100
+
101
+
112
102
  class PerfServerCMD(CLICommand):
113
103
  name = 'server'
114
104
 
@@ -127,12 +117,8 @@ class PerfServerCMD(CLICommand):
127
117
  # start monitor
128
118
  p_monitor = start_monitor(self.args)
129
119
  # start tensorboard
130
- p_tensorboard = start_tensorboard(self.args)
120
+ p_tensorboard = start_tensorboard(self.args)
131
121
  # start server
132
122
  p_server = start_server(self.args)
133
-
123
+
134
124
  wait_for_workers([p_monitor, p_tensorboard, p_server])
135
-
136
-
137
-
138
-
evalscope/config.py CHANGED
@@ -1,69 +1,137 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- import os
4
3
  import copy
5
- from dataclasses import dataclass, asdict, field
6
- from typing import Optional, List
4
+ import json
5
+ import os
6
+ from argparse import Namespace
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, List, Optional, Union
7
9
 
8
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
10
+ from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
9
11
  from evalscope.models.custom import CustomModel
10
- from evalscope.utils import yaml_to_dict
12
+ from evalscope.utils import gen_hash
13
+ from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
11
14
  from evalscope.utils.logger import get_logger
12
15
 
13
16
  logger = get_logger()
14
17
 
15
18
  cur_path = os.path.dirname(os.path.abspath(__file__))
16
19
 
17
- registry_tasks = {
18
- 'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
19
- 'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
20
- 'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
21
- 'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
22
- 'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
23
- 'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
24
- 'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
25
-
26
- # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
27
- # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
28
- # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
29
-
20
+ DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
21
+ DEFAULT_GENERATION_CONFIG = {
22
+ 'max_length': 2048,
23
+ 'max_new_tokens': 512,
24
+ 'do_sample': False,
25
+ 'top_k': 50,
26
+ 'top_p': 1.0,
27
+ 'temperature': 1.0,
30
28
  }
31
29
 
32
30
 
33
31
  @dataclass
34
32
  class TaskConfig:
35
- model_args: Optional[dict] = field(default_factory=dict)
36
- template_type: Optional[str] = 'default-generation'
37
- generation_config: Optional[dict] = field(default_factory=dict)
38
- dataset_args: Optional[dict] = field(default_factory=dict)
33
+ # Model-related arguments
34
+ model: Union[str, CustomModel, None] = None
35
+ model_id: Optional[str] = None
36
+ model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
37
+
38
+ # Template-related arguments
39
+ template_type: Optional[str] = None # Deprecated, will be removed in v1.0.0.
40
+ chat_template: Optional[str] = None
41
+
42
+ # Dataset-related arguments
43
+ datasets: Optional[List[str]] = None
44
+ dataset_args: Optional[Dict] = field(default_factory=dict)
45
+ dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
46
+ dataset_hub: str = HubType.MODELSCOPE
47
+
48
+ # Generation configuration arguments
49
+ generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
50
+
51
+ # Evaluation-related arguments
52
+ eval_type: str = EvalType.CHECKPOINT
53
+ eval_backend: str = EvalBackend.NATIVE
54
+ eval_config: Union[str, Dict, None] = None
55
+ stage: str = EvalStage.ALL
56
+ limit: Optional[int] = None
57
+
58
+ # Cache and working directory arguments
59
+ mem_cache: bool = False # Deprecated, will be removed in v1.0.0.
60
+ use_cache: Optional[str] = None
61
+ work_dir: str = DEFAULT_WORK_DIR
62
+ outputs: Optional[str] = None # Deprecated, will be removed in v1.0.0.
63
+
64
+ # Debug and runtime mode arguments
65
+ debug: bool = False
39
66
  dry_run: bool = False
40
- model: CustomModel = None
41
- eval_type: str = 'custom'
42
- datasets: list = field(default_factory=list)
43
- work_dir: str = DEFAULT_ROOT_CACHE_DIR
44
- outputs: str = None
45
- mem_cache: bool = False
46
- use_cache: bool = True
47
- stage: str = 'all' # `all` or `infer` or `review`
48
- dataset_hub: str = 'ModelScope'
49
- dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
50
- limit: int = None
51
- eval_backend: str = 'Native'
52
- eval_config: dict = field(default_factory=dict)
53
-
54
- # def __post_init__(self):
55
- # self.registry_tasks = {
56
- # 'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
57
- # 'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
58
- # 'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
59
- # 'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
60
- # 'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
61
- #
62
- # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
63
- # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
64
- # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
65
- #
66
- # }
67
+ seed: int = 42
68
+
69
+ def __post_init__(self):
70
+ if (not self.model_id) and self.model:
71
+ if isinstance(self.model, CustomModel):
72
+ self.model_id = type(self.model).__name__
73
+ else:
74
+ self.model_id = os.path.basename(self.model).rstrip(os.sep)
75
+
76
+ def to_dict(self):
77
+ # Note: to avoid serialization error for some model instance
78
+ return self.__dict__
79
+
80
+ def __str__(self):
81
+ return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
82
+
83
+ def update(self, other: Union['TaskConfig', dict]):
84
+ if isinstance(other, TaskConfig):
85
+ other = other.to_dict()
86
+ self.__dict__.update(other)
87
+
88
+ def dump_yaml(self, output_dir: str):
89
+ """Dump the task configuration to a YAML file."""
90
+ task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
91
+ try:
92
+ logger.info(f'Dump task config to {task_cfg_file}')
93
+ dict_to_yaml(self.to_dict(), task_cfg_file)
94
+ except Exception as e:
95
+ logger.warning(f'Failed to dump overall task config: {e}')
96
+
97
+ @staticmethod
98
+ def list():
99
+ return list(registry_tasks.keys())
100
+
101
+ @staticmethod
102
+ def from_yaml(yaml_file: str):
103
+ return TaskConfig.from_dict(yaml_to_dict(yaml_file))
104
+
105
+ @staticmethod
106
+ def from_dict(d: dict):
107
+ return TaskConfig(**d)
108
+
109
+ @staticmethod
110
+ def from_json(json_file: str):
111
+ return TaskConfig.from_dict(json_to_dict(json_file))
112
+
113
+ @staticmethod
114
+ def from_args(args: Namespace):
115
+ # Convert Namespace to a dictionary and filter out None values
116
+ args_dict = {k: v for k, v in vars(args).items() if v is not None}
117
+ del args_dict['func'] # Note: compat CLI arguments
118
+
119
+ return TaskConfig.from_dict(args_dict)
120
+
121
+ @staticmethod
122
+ def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
123
+ res_list = []
124
+ for task_name in tasks:
125
+ task = registry_tasks.get(task_name, None)
126
+ if task is None:
127
+ logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
128
+ continue
129
+
130
+ task.model = custom_model
131
+ task.model_id = type(custom_model).__name__
132
+ res_list.append(task)
133
+
134
+ return res_list
67
135
 
68
136
  @staticmethod
69
137
  def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +143,7 @@ class TaskConfig:
75
143
  data_pattern: str, the data pattern for the task.
76
144
  e.g. `mmlu`, `ceval`, `gsm8k`, ...
77
145
  refer to task_config.list() for all available datasets.
78
- dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
146
+ dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
79
147
  then your specific custom dataset directory will be /path/to/data/{name}
80
148
  subset_list: list, the subset list for the dataset.
81
149
  e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +151,55 @@ class TaskConfig:
83
151
  """
84
152
  available_datasets = list(registry_tasks.keys())
85
153
  if data_pattern not in available_datasets:
86
- logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
154
+ logger.error(
155
+ f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
87
156
  return
88
157
 
89
158
  # Reuse the existing task config and update the datasets
90
- pattern_config = registry_tasks.get(data_pattern)
159
+ pattern_config = registry_tasks[data_pattern]
91
160
 
92
161
  custom_config = copy.deepcopy(pattern_config)
93
- custom_config.update({'datasets': [data_pattern]})
94
- custom_config.update({'dataset_hub': 'Local'}) # TODO: to support `ModelScope`
95
- if 'dataset_args' in custom_config:
96
- if data_pattern not in custom_config:
97
- custom_config['dataset_args'].update({data_pattern: {}})
98
- else:
99
- custom_config.update({'dataset_args': {data_pattern: {}}})
162
+ custom_config.datasets = [data_pattern]
163
+ custom_config.dataset_args = {data_pattern: {}}
164
+ custom_config.eval_type = EvalType.CHECKPOINT
100
165
 
101
166
  if dataset_dir is not None:
102
- custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
167
+ custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
103
168
 
104
169
  if subset_list is not None:
105
- # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
106
- custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
170
+ custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
107
171
 
108
172
  registry_tasks.update({name: custom_config})
109
173
  logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
110
174
 
111
- def to_dict(self):
112
- # Note: to avoid serialization error for some model instance
113
- _tmp_model = copy.copy(self.model)
114
- self.model = None
115
- res_dict = asdict(self)
116
- res_dict.update({'model': _tmp_model})
117
- self.model = _tmp_model
118
-
119
- return res_dict
120
175
 
121
- @staticmethod
122
- def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
123
- res_list = []
124
- for task_name in tasks:
125
- task: dict = registry_tasks.get(task_name, None)
126
- if task is None:
127
- logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
128
- continue
129
-
130
- res = TaskConfig(**task)
131
- res.model = custom_model
132
- if res.outputs is None:
133
- res.outputs = os.path.join(res.work_dir,
134
- 'outputs',
135
- f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
136
- res_list.append(res)
137
-
138
- return res_list
139
-
140
- @staticmethod
141
- def list():
142
- return list(registry_tasks.keys())
176
+ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
177
+
178
+ registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
179
+
180
+
181
+ def parse_task_config(task_cfg) -> TaskConfig:
182
+ """Parse task configuration from various formats into a TaskConfig object."""
183
+ if isinstance(task_cfg, TaskConfig):
184
+ logger.info('Args: Task config is provided with TaskConfig type.')
185
+ elif isinstance(task_cfg, dict):
186
+ logger.info('Args: Task config is provided with dictionary type.')
187
+ task_cfg = TaskConfig.from_dict(task_cfg)
188
+ elif isinstance(task_cfg, Namespace):
189
+ logger.info('Args: Task config is provided with CommandLine type.')
190
+ task_cfg = TaskConfig.from_args(task_cfg)
191
+ elif isinstance(task_cfg, str):
192
+ extension = task_cfg.split('.')[-1]
193
+ logger.info(f'Args: Task config is provided with {extension} file type.')
194
+ if extension in ['yaml', 'yml']:
195
+ task_cfg = TaskConfig.from_yaml(task_cfg)
196
+ elif extension == 'json':
197
+ task_cfg = TaskConfig.from_json(task_cfg)
198
+ else:
199
+ raise ValueError('Args: Unsupported file extension.')
200
+ else:
201
+ raise ValueError('Args: Please provide a valid task config.')
202
+ return task_cfg
143
203
 
144
204
 
145
205
  class TempModel(CustomModel):
@@ -158,9 +218,7 @@ if __name__ == '__main__':
158
218
  # Register a new task
159
219
  TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
160
220
 
161
- import json
162
221
  swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
163
222
  for item in swift_eval_task:
164
- print(item.to_dict())
223
+ print(item)
165
224
  print()
166
-
evalscope/constants.py CHANGED
@@ -1,7 +1,18 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
- from enum import Enum
2
+ from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
3
+ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
3
4
 
4
- DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
5
+ DEFAULT_WORK_DIR = './outputs'
6
+ DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION # master
7
+ DEFAULT_MODEL_CACHE_DIR = get_model_cache_root() # ~/.cache/modelscope/hub
8
+ DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root() # ~/.cache/modelscope/datasets
9
+ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old version
10
+
11
+
12
+ class HubType:
13
+ MODELSCOPE = 'modelscope'
14
+ HUGGINGFACE = 'huggingface'
15
+ LOCAL = 'local'
5
16
 
6
17
 
7
18
  class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
25
36
  ]
26
37
 
27
38
 
28
- class MetricMembers(Enum):
39
+ class MetricMembers:
29
40
 
30
41
  # Math accuracy metric
31
42
  MATH_ACCURACY = 'math_accuracy'
@@ -65,54 +76,25 @@ class ArenaMode:
65
76
  PAIRWISE_BASELINE = 'pairwise_baseline'
66
77
 
67
78
 
68
- class OutputsStructure:
69
-
70
- LOGS_DIR = 'logs_dir'
71
-
72
- PREDICTIONS_DIR = 'predictions_dir'
73
-
74
- REVIEWS_DIR = 'reviews_dir'
75
-
76
- REPORTS_DIR = 'reports_dir'
77
-
78
- CONFIGS_DIR = 'configs_dir'
79
-
80
-
81
79
  class AnswerKeys:
82
-
83
80
  ANSWER_ID = 'answer_id'
84
-
85
81
  RAW_INPUT = 'raw_input'
86
-
87
82
  ORIGIN_PROMPT = 'origin_prompt'
88
-
89
83
  MODEL_SPEC = 'model_spec'
90
-
91
84
  SUBSET_NAME = 'subset_name'
92
-
93
85
  CHOICES = 'choices'
94
86
 
95
87
 
96
88
  class ReviewKeys:
97
-
98
89
  REVIEW_ID = 'review_id'
99
-
100
90
  REVIEWED = 'reviewed'
101
-
102
91
  REVIEWER_SPEC = 'reviewer_spec'
103
-
104
92
  REVIEW_TIME = 'review_time'
105
-
106
93
  MESSAGE = 'message'
107
-
108
94
  CONTENT = 'content'
109
-
110
95
  GOLD = 'gold'
111
-
112
96
  PRED = 'pred'
113
-
114
97
  RESULT = 'result'
115
-
116
98
  REVIEW = 'review'
117
99
 
118
100
 
@@ -148,3 +130,39 @@ class EvalStage:
148
130
  ALL = 'all'
149
131
  INFER = 'infer'
150
132
  REVIEW = 'review'
133
+
134
+
135
+ class EvalType:
136
+
137
+ CUSTOM = 'custom'
138
+ CHECKPOINT = 'checkpoint'
139
+
140
+
141
+ class EvalBackend:
142
+
143
+ class _Backend:
144
+ # compatible with old version, set 'value'
145
+
146
+ def __init__(self, value):
147
+ self._value = value
148
+
149
+ @property
150
+ def value(self):
151
+ return self._value
152
+
153
+ def __str__(self):
154
+ return self._value
155
+
156
+ def __repr__(self):
157
+ return f"'{self._value}'"
158
+
159
+ def __eq__(self, other):
160
+ if isinstance(other, str):
161
+ return self._value == other
162
+ return NotImplemented
163
+
164
+ NATIVE = _Backend('Native')
165
+ OPEN_COMPASS = _Backend('OpenCompass')
166
+ VLM_EVAL_KIT = _Backend('VLMEvalKit')
167
+ RAG_EVAL = _Backend('RAGEval')
168
+ THIRD_PARTY = _Backend('ThirdParty')