evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
@@ -0,0 +1,64 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = ['default']
+
+ OPEN_PROMPT = (
+     'Read the picture and solve the following problem step by step.'
+     'The last line of your response should be of the form'
+     ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+     '{question}\n\n'
+     'Remember to put your answer on its own line at the end in the form'
+     ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+     ' and you do not need to use a \\boxed command.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='real_world_qa',
+         pretty_name='RealWorldQA',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.',  # noqa: E501
+         dataset_id='lmms-lab/RealWorldQA',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=OPEN_PROMPT,
+     )
+ )
+ class RealWorldQAAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=record['answer'],
+             metadata={'image_path': record['image_path']}
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         pattern = r'ANSWER:\s*(.*)'
+         match = re.search(pattern, prediction)
+         if match:
+             return match.group(1).strip()
+         return ''
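The adapter's `extract_answer` keys on the final `ANSWER:` line that OPEN_PROMPT asks the model to emit. A minimal sketch of how that regex behaves on a made-up model response (the prediction string below is illustrative, not part of the package):

import re

# Hypothetical model output that follows OPEN_PROMPT's instructions.
prediction = 'The posted speed limit in the image is 35 mph.\nANSWER: 35'

# Same pattern as RealWorldQAAdapter.extract_answer above.
match = re.search(r'ANSWER:\s*(.*)', prediction)
print(match.group(1).strip() if match else '')  # prints: 35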
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
  input=[dict_to_chat_message(msg) for msg in messages],
  tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
  )
- oai_res = openai_chat_choices(res.choices)
+ oai_res = openai_chat_choices(res.choices, include_reasoning=False)

  next_message = oai_res[0].message.model_dump(exclude_none=True)

@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
  from evalscope.constants import Tags
  from evalscope.utils import get_logger
  from evalscope.utils.function_utils import run_once
+ from evalscope.utils.import_utils import check_import

  logger = get_logger()

@@ -35,8 +36,8 @@ logger = get_logger()
  'api_key': 'EMPTY',
  'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'generation_config': {
- 'temperature': 0.7,
- 'max_new_tokens': 1024
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
  }
  }
  )
@@ -46,22 +47,18 @@ class TauBenchAdapter(DefaultDataAdapter):
  def __init__(self, **kwargs):
  super().__init__(**kwargs)

- spec = importlib.util.find_spec('tau_bench')
- if spec is None:
- raise ImportError(
- '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
- )
+ check_import(
+ 'tau_bench',
+ package='git+https://github.com/sierra-research/tau-bench',
+ raise_error=True,
+ feature_name=self.pretty_name
+ )

  # setup user model args
  self.user_model = self.extra_params.get('user_model', 'qwen-plus')
  self.api_key = self.extra_params.get('api_key', 'EMPTY')
  self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
- self.generation_config = self.extra_params.get(
- 'generation_config', {
- 'temperature': 0.7,
- 'max_new_tokens': 1024
- }
- )
+ self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})

  self._patch_env_completion()

@@ -84,10 +81,10 @@ class TauBenchAdapter(DefaultDataAdapter):

  res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

- message = res.message.model_dump(exclude_none=True)
+ message = {'role': 'assistant', 'content': res.completion}
  self.messages.append(message)
  self.total_cost = 0
- return message['content']
+ return res.completion

  # get the current instance of TauBenchAdapter
  adapter_instance = self
@@ -114,7 +111,11 @@ class TauBenchAdapter(DefaultDataAdapter):
  })
  # load dataset
  dataset = DictDataLoader(
- dict_list=tasks, sample_fields=self.record_to_sample, limit=self.limit, repeats=self.repeats
+ dict_list=tasks,
+ sample_fields=self.record_to_sample,
+ limit=self.limit,
+ repeats=self.repeats,
+ shuffle=self.shuffle,
  ).load()

  data_dict[env_name] = dataset
@@ -145,15 +146,15 @@ class TauBenchAdapter(DefaultDataAdapter):

  try:
  # Parse the prediction to get the reward
- res = task_state.metadata
- reward = res.get('reward', 0.0)
+ task_result = task_state.metadata['task_result']
+ reward = task_result.get('reward', 0.0)

  score.value = {
  'Pass^1': float(reward),
  }
  score.explanation = f'Task completed with reward: {reward}'
  score.metadata = {
- 'task_result': task_result,
  'env_name': task_state.metadata.get('env_name', 'unknown'),
  'task_index': task_state.metadata.get('task_index', -1)
  }
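For reference, the tau_bench defaults changed above (user simulator `qwen-plus`, `temperature` 0.0, `max_tokens` 4096) remain overridable per run. A hedged sketch, assuming the usual `dataset_args` → `extra_params` plumbing and the top-level `TaskConfig` import of evalscope 1.0.x; the agent model name is a placeholder:

from evalscope import TaskConfig  # top-level import assumed

task = TaskConfig(
    model='my-agent-model',            # placeholder model identifier
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'extra_params': {
                'user_model': 'qwen-plus',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': 'EMPTY',
                # New 1.0.2 defaults: deterministic user turns, larger budget.
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        },
    },
)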
@@ -16,8 +16,10 @@ logger = get_logger()
  @register_benchmark(
  BenchmarkMeta(
  name='evalmuse',
+ pretty_name='EvalMuse',
  dataset_id='AI-ModelScope/T2V-Eval-Prompts',
- description='EvalMuse Text-to-Image Benchmark',
+ description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+ 'and semantic alignment of finely generated images',
  tags=[Tags.TEXT_TO_IMAGE],
  subset_list=['EvalMuse'],
  metric_list=['FGA_BLIP2Score'],
@@ -4,7 +4,6 @@ import os
  from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
  from evalscope.api.dataset import Sample
  from evalscope.api.messages import ChatMessageUser
- from evalscope.api.metric.scorer import Score
  from evalscope.api.registry import get_metric, register_benchmark
  from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
  @register_benchmark(
  BenchmarkMeta(
  name='genai_bench',
+ pretty_name='GenAI-Bench',
  dataset_id='AI-ModelScope/T2V-Eval-Prompts',
- description='GenAI-Bench Text-to-Image Benchmark',
+ description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
  tags=[Tags.TEXT_TO_IMAGE],
  subset_list=['GenAI-Bench-1600'],
  metric_list=['VQAScore'],
@@ -16,7 +16,7 @@ logger = get_logger()
  name='general_t2i',
  dataset_id='general_t2i',
  description='General Text-to-Image Benchmark',
- tags=[Tags.TEXT_TO_IMAGE],
+ tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
  subset_list=['default'],
  metric_list=['PickScore'],
  few_shot_num=0,
@@ -14,8 +14,10 @@ logger = get_logger()
  @register_benchmark(
  BenchmarkMeta(
  name='hpdv2',
+ pretty_name='HPD-v2',
  dataset_id='AI-ModelScope/T2V-Eval-Prompts',
- description='HPDv2 Text-to-Image Benchmark',
+ description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+ 'trained on the Human Preference Dataset (HPD v2)',
  tags=[Tags.TEXT_TO_IMAGE],
  subset_list=['HPDv2'],
  metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
  return Sample(
  input=[ChatMessageUser(content=record['prompt'])],
  metadata={
+ 'id': record['id'],
+ 'prompt': record['prompt'],
  'category': record.get('tags', {}).get('category', ''),
- 'tags': record.get('tags', {})
+ 'tags': record.get('tags', {}),
+ 'image_path': record.get('image_path', ''), # Optional field for existing image path
  }
  )
@@ -10,6 +10,7 @@ logger = get_logger()
  @register_benchmark(
  BenchmarkMeta(
  name='tifa160',
+ pretty_name='TIFA-160',
  dataset_id='AI-ModelScope/T2V-Eval-Prompts',
  description='TIFA-160 Text-to-Image Benchmark',
  tags=[Tags.TEXT_TO_IMAGE],
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
  dataset_id='evalscope/truthful_qa',
  metric_list=['multi_choice_acc'],
  subset_list=['multiple_choice'],
+ shuffle_choices=True,
  few_shot_num=0,
  train_split=None,
  eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):

  super().__init__(**kwargs)

- self.shuffle_choices = True
-
  self.multiple_correct = self.extra_params.get('multiple_correct', False)
  if self.multiple_correct:
  self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
  parser.set_defaults(func=subparser_func)

  def execute(self):
- from evalscope.app import create_app
+ try:
+ from evalscope.app import create_app
+ except ImportError as e:
+ raise ImportError(
+ f'Failed to import create_app from evalscope.app, due to {e}. '
+ "Please run `pip install 'evalscope[app]'`."
+ )

  create_app(self.args)
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
  parser.set_defaults(func=subparser_func)

  def execute(self):
- from evalscope.perf.main import run_perf_benchmark
+ try:
+ from evalscope.perf.main import run_perf_benchmark
+ except ImportError as e:
+ raise ImportError(
+ f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+ "Please run `pip install 'evalscope[perf]'`."
+ )

  run_perf_benchmark(self.args)
evalscope/config.py CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
  from dataclasses import dataclass, field
  from typing import Dict, List, Optional, Union

- from evalscope.api.model import GenerateConfig
+ from evalscope.api.model import GenerateConfig, Model, ModelAPI
  from evalscope.constants import (
  DEFAULT_DATASET_CACHE_DIR,
  DEFAULT_WORK_DIR,
@@ -15,10 +15,10 @@ from evalscope.constants import (
  HubType,
  JudgeStrategy,
  ModelTask,
- OutputType,
  )
  from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
  from evalscope.utils.deprecation_utils import deprecated_warning
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
  from evalscope.utils.logger import get_logger

@@ -28,51 +28,115 @@ logger = get_logger()
  @dataclass
  class TaskConfig(BaseArgument):
  # Model-related arguments
- model: Optional[str] = None
+ model: Optional[Union[str, Model, ModelAPI]] = None
+ """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
  model_id: Optional[str] = None
+ """Unique identifier for the model. Auto-generated from model name if not provided."""
+
  model_args: Dict = field(default_factory=dict)
+ """Additional arguments to pass to the model during initialization."""
+
  model_task: str = ModelTask.TEXT_GENERATION
+ """The type of task the model performs (e.g., text generation, image generation)."""

  # Template-related arguments
  chat_template: Optional[str] = None
+ """Chat template to use for formatting conversations with the model."""

  # Dataset-related arguments
  datasets: List[str] = field(default_factory=list)
+ """List of dataset names to evaluate the model on."""
+
  dataset_args: Dict = field(default_factory=dict)
+ """Additional arguments to pass to datasets during loading."""
+
  dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+ """Directory where datasets are cached locally."""
+
  dataset_hub: str = HubType.MODELSCOPE
- repeats: int = 1 # Number of times to repeat the dataset items for k-metrics
+ """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+ repeats: int = 1
+ """Number of times to repeat the dataset items for k-metrics evaluation."""

  # Generation configuration arguments
  generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+ """Configuration parameters for text/image generation."""

  # Evaluation-related arguments
  eval_type: str = EvalType.CHECKPOINT
+ """Type of evaluation: checkpoint, service, or mock."""
+
  eval_backend: str = EvalBackend.NATIVE
+ """Backend framework to use for evaluation."""
+
  eval_config: Union[str, Dict, None] = None
+ """Additional evaluation configuration parameters."""
+
  limit: Optional[Union[int, float]] = None
+ """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
  eval_batch_size: int = 1
+ """Batch size for evaluation processing."""

  # Cache and working directory arguments
  use_cache: Optional[str] = None
+ """Whether to use cached results and which cache strategy to apply."""
+
  rerun_review: bool = False
+ """Whether to rerun the review process even if results exist."""
+
  work_dir: str = DEFAULT_WORK_DIR
+ """Working directory for storing evaluation results and temporary files."""

  # Debug and runtime mode arguments
  ignore_errors: bool = False
+ """Whether to continue evaluation when encountering errors."""
+
  debug: bool = False
- dry_run: bool = False
+ """Enable debug mode for detailed logging and error reporting."""
+
  seed: Optional[int] = 42
- api_url: Optional[str] = None # Only used for server model
- api_key: Optional[str] = 'EMPTY' # Only used for server model
- timeout: Optional[float] = None # Only used for server model
- stream: Optional[bool] = None # Only used for server model
+ """Random seed for reproducible results."""
+
+ api_url: Optional[str] = None
+ """API endpoint URL for server-based model evaluation."""
+
+ api_key: Optional[str] = 'EMPTY'
+ """API key for authenticating with server-based models."""
+
+ timeout: Optional[float] = None
+ """Request timeout in seconds for server-based models."""
+
+ stream: Optional[bool] = None
+ """Whether to use streaming responses for server-based models."""

  # LLMJudge arguments
  judge_strategy: str = JudgeStrategy.AUTO
+ """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
  judge_worker_num: int = 1
+ """Number of worker processes for parallel LLM judging."""
+
  judge_model_args: Optional[Dict] = field(default_factory=dict)
+ """Additional arguments for the judge model configuration."""
+
  analysis_report: bool = False
+ """Whether to generate detailed analysis reports after evaluation."""
+
+ # Sandbox configuration arguments
+ use_sandbox: bool = False
+ """Whether to execute code in a sandboxed environment."""
+
+ sandbox_type: Optional[str] = 'docker'
+ """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+ sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+ """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+ sandbox_config: Optional[Dict] = field(default_factory=dict)
+ """Configuration for sandboxed code execution environments."""

  def __post_init__(self):
  self.__init_model_and_id()
@@ -82,20 +146,22 @@ class TaskConfig(BaseArgument):
  # Set default generation_config and model_args
  self.__init_default_generation_config()
  self.__init_default_model_args()
+ self.__init_default_sandbox_config()

  def __init_model_and_id(self):
  # Set model to DummyCustomModel if not provided
  if self.model is None:
  self.model = self.model_task
  self.eval_type = EvalType.MOCK_LLM
- else:
- if self.model_task == ModelTask.IMAGE_GENERATION:
- self.eval_type = EvalType.TEXT2IMAGE

  # Set model_id if not provided
  if not self.model_id:
- if self.model:
+ if isinstance(self.model, str):
  self.model_id = safe_filename(os.path.basename(self.model))
+ elif isinstance(self.model, Model):
+ self.model_id = safe_filename(self.model.name)
+ elif isinstance(self.model, ModelAPI):
+ self.model_id = safe_filename(self.model.model_name)
  else:
  self.model_id = 'dummy_model'

@@ -113,6 +179,11 @@
  'num_inference_steps': 50,
  'guidance_scale': 9.0,
  }
+ if self.eval_batch_size != 1:
+ logger.warning(
+ 'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+ )
+ self.eval_batch_size = 1
  elif self.model_task == ModelTask.TEXT_GENERATION:
  if self.eval_type == EvalType.CHECKPOINT:
  self.generation_config = {
@@ -167,6 +238,14 @@
  'precision': 'torch.float16',
  }

+ def __init_default_sandbox_config(self):
+ if not self.use_sandbox:
+ return
+ check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+ if not self.sandbox_type:
+ self.sandbox_type = 'docker'
+
  def update(self, other: Union['TaskConfig', dict]):
  if isinstance(other, TaskConfig):
  other = other.to_dict()
@@ -182,9 +261,12 @@
  logger.warning(f'Failed to dump overall task config: {e}')

  def to_dict(self):
- result = copy.deepcopy(self.__dict__)
+ result = copy.copy(self.__dict__)
  del result['api_key'] # Do not expose api_key in the config

+ if isinstance(self.model, (Model, ModelAPI)):
+ result['model'] = self.model.__class__.__name__
+
  if isinstance(self.generation_config, GenerateConfig):
  result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
  return result
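The sandbox options above are opt-in: `__init_default_sandbox_config` is a no-op unless `use_sandbox=True`, and it then requires `ms_enclave[docker]` via `check_import`. A hedged sketch of enabling it, assuming the top-level `TaskConfig`/`run_task` entry points of 1.0.x; the model name and dataset choice are illustrative (the file list shows live_code_bench gaining sandbox evaluate utilities in this release):

from evalscope import TaskConfig, run_task  # top-level imports assumed

task = TaskConfig(
    model='qwen-plus',                 # placeholder model identifier
    datasets=['live_code_bench'],      # benchmark with sandbox evaluate utils in 1.0.2
    use_sandbox=True,                  # triggers the ms_enclave[docker] import check
    sandbox_type='docker',             # also the default when left unset
    sandbox_manager_config={},         # local manager; pass {'url': ...} to use a remote one
    sandbox_config={},                 # per-sandbox execution options
)
run_task(task)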
evalscope/constants.py CHANGED
@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
  DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
  os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
  ) # ~/.cache/evalscope
+ IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1' # To avoid some heavy dependencies when building doc


  class HubType:
@@ -70,6 +71,7 @@ class EvalType:
  CHECKPOINT = 'llm_ckpt' # native model checkpoint
  SERVICE = 'openai_api' # model service
  TEXT2IMAGE = 'text2image' # image generation service
+ IMAGE_EDITING = 'image_editing' # image editing service


  class OutputType:
@@ -127,3 +129,12 @@ class Tags:
  RETRIEVAL = 'Retrieval'
  FUNCTION_CALLING = 'FunctionCalling'
  TEXT_TO_IMAGE = 'TextToImage'
+ IMAGE_EDITING = 'ImageEditing'
+ MULTI_MODAL = 'MultiModal'
+ MULTI_LINGUAL = 'MultiLingual'
+ MULTI_TURN = 'MultiTurn'
+
+
+ class FileConstants:
+ IMAGE_PATH = 'image_path'
+ ID = 'id'
@@ -8,8 +8,9 @@ and report generation.
  """

  import os
+ import traceback
  from collections import defaultdict
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
  from tqdm import tqdm
  from typing import TYPE_CHECKING, Dict, List, Tuple, Union

@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
  from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
  from evalscope.api.metric import AggScore, SampleScore
  from evalscope.report import Report, gen_table
+ from evalscope.utils.logger import get_logger

  if TYPE_CHECKING:
  from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
  from evalscope.config import TaskConfig
  from evalscope.utils.io_utils import OutputsStructure

- from evalscope.utils.logger import get_logger
-
  logger = get_logger()


@@ -96,12 +96,17 @@ class DefaultEvaluator(Evaluator):

  # Process each subset (e.g., test, validation) independently
  for subset, dataset in dataset_dict.items():
- assert len(dataset) > 0, f'No samples found in subset: {subset}'
+ if len(dataset) == 0:
+ logger.info(f'No samples found in subset: {subset}, skipping.')
+ continue
  subset_score = self.evaluate_subset(subset, dataset)
  agg_score_dict[subset] = subset_score

  # Generate the report based on aggregated scores
  report = self.get_report(agg_score_dict)
+
+ # Finalize the evaluation process
+ self.finalize()
  return report

  def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -181,10 +186,13 @@
  model_result = self.cache_manager.save_prediction_cache(
  subset, task_state, self.benchmark.save_metadata
  )
- logger.debug(f'Model result: \n{model_result.model_dump_json(indent=2)}')
+ logger.debug(f'Model result: \n{model_result.pretty_print()}')

  except Exception as exc:
- logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
+ tb_str = traceback.format_exc()
+ logger.error(
+ f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+ )
  if self.task_config.ignore_errors:
  logger.warning('Error ignored, continuing with next sample.')
  else:
@@ -251,7 +259,13 @@
  for future in as_completed(future_to_task_state):
  task_state = future_to_task_state[future]
  try:
- sample_score = future.result()
+ try:
+ sample_score = future.result()
+ except TimeoutError:
+ logger.warning(
+ f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+ )
+ sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
  sample_score_list.append(sample_score)

  # Save the review result to cache for future use
@@ -261,10 +275,13 @@
  sample_score=sample_score,
  save_metadata=self.benchmark.save_metadata
  )
- logger.debug(f'Review result: \n{review_result.model_dump_json(indent=2)}')
+ logger.debug(f'Review result: \n{review_result.pretty_print()}')

  except Exception as exc:
- logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
+ tb_str = traceback.format_exc()
+ logger.error(
+ f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+ )
  if self.task_config.ignore_errors:
  logger.warning('Error ignored, continuing with next sample.')
  else:
@@ -317,7 +334,7 @@

  # Generate and display a summary table of results
  try:
- report_table = gen_table(report_list=[report], add_overall_metric=True)
+ report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
  logger.info(f'\n{self.benchmark_name} report table:'
  f'\n{report_table} \n')
  except Exception:
@@ -335,3 +352,6 @@
  report.to_json(report_file)
  logger.info(f'Dump report to: {report_file} \n')
  return report
+
+ def finalize(self, *args, **kwargs):
+ self.benchmark.finalize(*args, **kwargs)
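The new `finalize()` pass-through gives a benchmark a post-report hook. A speculative sketch of an adapter using it; the `DefaultDataAdapter` import path and the base-class `finalize` signature are assumed from the adapters shown elsewhere in this diff:

from evalscope.api.benchmark import DefaultDataAdapter  # import path assumed

class MyBenchmarkAdapter(DefaultDataAdapter):

    def finalize(self, *args, **kwargs):
        # Called via DefaultEvaluator.finalize() after the report is written,
        # e.g. to stop sandboxes, close clients, or clean up temporary files.
        super().finalize(*args, **kwargs)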