evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
  6. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  7. evalscope/api/benchmark/benchmark.py +35 -0
  8. evalscope/api/benchmark/meta.py +6 -0
  9. evalscope/api/dataset/dataset.py +6 -6
  10. evalscope/api/dataset/loader.py +2 -1
  11. evalscope/api/evaluator/cache.py +24 -1
  12. evalscope/api/evaluator/state.py +12 -1
  13. evalscope/api/messages/__init__.py +1 -0
  14. evalscope/api/messages/chat_message.py +47 -2
  15. evalscope/api/metric/scorer.py +15 -7
  16. evalscope/api/mixin/__init__.py +0 -1
  17. evalscope/api/model/generate_config.py +1 -3
  18. evalscope/api/model/model.py +4 -1
  19. evalscope/app/app.py +3 -0
  20. evalscope/app/ui/single_model.py +3 -3
  21. evalscope/app/utils/data_utils.py +7 -7
  22. evalscope/app/utils/env_utils.py +12 -0
  23. evalscope/app/utils/text_utils.py +14 -12
  24. evalscope/arguments.py +2 -4
  25. evalscope/backend/opencompass/backend_manager.py +0 -2
  26. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  27. evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
  28. evalscope/benchmarks/bfcl/generation.py +2 -2
  29. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  31. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  32. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  33. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  34. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  35. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  36. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  37. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  38. evalscope/benchmarks/mmmu/__init__.py +0 -0
  39. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  40. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  41. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  42. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
  43. evalscope/benchmarks/tau_bench/generation.py +1 -1
  44. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
  45. evalscope/benchmarks/text2image/__init__.py +0 -0
  46. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  47. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  48. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  49. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  50. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  51. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  52. evalscope/cli/start_app.py +7 -1
  53. evalscope/cli/start_perf.py +7 -1
  54. evalscope/config.py +72 -13
  55. evalscope/constants.py +8 -0
  56. evalscope/evaluator/evaluator.py +6 -4
  57. evalscope/metrics/llm_judge.py +19 -7
  58. evalscope/models/image_edit_model.py +125 -0
  59. evalscope/models/model_apis.py +20 -0
  60. evalscope/models/openai_compatible.py +3 -0
  61. evalscope/models/text2image_model.py +2 -2
  62. evalscope/models/utils/openai.py +7 -4
  63. evalscope/perf/benchmark.py +2 -0
  64. evalscope/perf/utils/benchmark_util.py +8 -5
  65. evalscope/perf/utils/local_server.py +3 -0
  66. evalscope/report/__init__.py +0 -1
  67. evalscope/report/generator.py +8 -87
  68. evalscope/run.py +9 -5
  69. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  70. evalscope/utils/chat_service.py +1 -1
  71. evalscope/utils/import_utils.py +23 -1
  72. evalscope/utils/io_utils.py +42 -1
  73. evalscope/utils/model_utils.py +4 -3
  74. evalscope/utils/multi_choices.py +23 -6
  75. evalscope/version.py +2 -2
  76. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
  77. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
  78. tests/benchmark/test_eval.py +30 -31
  79. tests/benchmark/test_image_edit.py +65 -0
  80. tests/benchmark/test_vlm.py +80 -0
  81. tests/cli/test_all.py +83 -43
  82. tests/cli/test_collection.py +8 -5
  83. tests/cli/test_reasoning.py +81 -0
  84. tests/common.py +73 -0
  85. tests/perf/test_perf.py +4 -2
  86. tests/rag/test_clip_benchmark.py +0 -3
  87. evalscope/api/mixin/dataset_mixin.py +0 -105
  88. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  89. tests/aigc/__init__.py +0 -1
  90. /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
  91. /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
  92. /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
  93. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  94. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  95. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  96. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  97. /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/config.py CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,7 +15,6 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
@@ -28,51 +27,102 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-    repeats: int = 1  # Number of times to repeat the dataset items for k-metrics
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-    dry_run: bool = False
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-    api_url: Optional[str] = None  # Only used for server model
-    api_key: Optional[str] = 'EMPTY'  # Only used for server model
-    timeout: Optional[float] = None  # Only used for server model
-    stream: Optional[bool] = None  # Only used for server model
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
    judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
    analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
 
     def __post_init__(self):
         self.__init_model_and_id()
@@ -88,14 +138,15 @@ class TaskConfig(BaseArgument):
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE
 
         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'
 
@@ -113,6 +164,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -185,6 +241,9 @@ class TaskConfig(BaseArgument):
         result = copy.deepcopy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
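
Usage note: with the widened model field, TaskConfig now accepts a Model or ModelAPI object as well as a string path or ID, and model_id is derived from whichever form is given. A minimal sketch of the string form, with placeholder dataset and limit values that are not taken from this release:

    from evalscope.config import TaskConfig

    task = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',      # a plain string model ID/path still works
        datasets=['gsm8k'],                      # placeholder dataset name
        limit=0.1,                               # float -> fraction of the dataset; int -> sample count
        generation_config={'temperature': 0.0},
    )
    # model_id falls back to safe_filename(os.path.basename(model)) when not set explicitly
    print(task.model_id)
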
evalscope/constants.py CHANGED
@@ -70,6 +70,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -127,3 +128,10 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
evalscope/evaluator/evaluator.py CHANGED
@@ -96,7 +96,9 @@ class DefaultEvaluator(Evaluator):
 
         # Process each subset (e.g., test, validation) independently
         for subset, dataset in dataset_dict.items():
-            assert len(dataset) > 0, f'No samples found in subset: {subset}'
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score
 
@@ -181,7 +183,7 @@ class DefaultEvaluator(Evaluator):
                 model_result = self.cache_manager.save_prediction_cache(
                     subset, task_state, self.benchmark.save_metadata
                 )
-                logger.debug(f'Model result: \n{model_result.model_dump_json(indent=2)}')
+                logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
             except Exception as exc:
                 logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
@@ -261,10 +263,10 @@ class DefaultEvaluator(Evaluator):
                     sample_score=sample_score,
                     save_metadata=self.benchmark.save_metadata
                 )
-                logger.debug(f'Review result: \n{review_result.model_dump_json(indent=2)}')
+                logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
             except Exception as exc:
-                logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
+                logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
                 if self.task_config.ignore_errors:
                     logger.warning('Error ignored, continuing with next sample.')
                 else:
evalscope/metrics/llm_judge.py CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(self, prompt: str, system_prompt: Optional[str] = None) -> str:
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
             str: The response from the LLM
         """
-        from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
-
-        system_content = system_prompt or self.system_prompt
-        input_messages = [ChatMessageUser(content=prompt)]
-        if system_content:
-            input_messages.insert(0, ChatMessageSystem(content=system_content))
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
             response = self.model.generate(input_messages)
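
Usage note: the new messages parameter lets a caller pass a pre-built conversation and skip the prompt/system_prompt path entirely. A rough sketch, assuming judge_llm is an already-configured LLMJudge instance (its constructor is not shown in this hunk) and with placeholder grading text:

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

    msgs = [
        ChatMessageSystem(content='You are a strict grader. Answer only YES or NO.'),
        ChatMessageUser(content='Question: ...\nReference: ...\nCandidate answer: ...'),
    ]
    verdict = judge_llm.judge(messages=msgs)  # prompt/system_prompt are ignored when messages is given

    # The original call style keeps working:
    verdict = judge_llm.judge(prompt='Grade this answer ...', system_prompt='You are a strict grader.')
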
evalscope/models/image_edit_model.py ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
evalscope/models/model_apis.py CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,21 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
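
The two usage patterns above and in benchmark_util.py below suggest check_import's contract: with raise_error=True it fails fast with an install hint (the package argument names the pip package or extra), otherwise it returns a boolean. A short sketch of both patterns:

    from evalscope.utils.import_utils import check_import

    # Hard requirement: raise with an install hint before loading a torch-backed model API.
    check_import('torch', package='evalscope[aigc]', raise_error=True)

    # Soft check: skip optional GPU bookkeeping when torch is not installed.
    if check_import('torch'):
        import torch
        print(torch.cuda.device_count())
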
evalscope/models/openai_compatible.py CHANGED
@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
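
The added normalization means users can configure either the API root or the full chat-completions endpoint; both collapse to the same base URL. A quick illustration of the string handling (plain Python, no evalscope dependency):

    for url in (
        'http://127.0.0.1:8000/v1',
        'http://127.0.0.1:8000/v1/',
        'http://127.0.0.1:8000/v1/chat/completions',
    ):
        print(url.rstrip('/').removesuffix('/chat/completions'))
    # all three print: http://127.0.0.1:8000/v1
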
evalscope/models/text2image_model.py CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-        if config.extra_body is not None:
-            kwargs.update(config.extra_body)
+        # update with extra model parameters
+        kwargs.update(config.model_extra)
 
         # assume the first text as prompt
         prompt = input[0].text
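
The switch from config.extra_body to config.model_extra implies GenerateConfig is a pydantic model that allows extra fields, with model_extra acting as pydantic v2's bucket for them, so arbitrary generation parameters flow through to the pipeline. A generic sketch of that pattern with a stand-in model, not evalscope's actual GenerateConfig:

    from typing import Optional

    from pydantic import BaseModel, ConfigDict

    class DemoGenerateConfig(BaseModel):
        model_config = ConfigDict(extra='allow')  # keep unknown fields instead of rejecting them
        num_inference_steps: Optional[int] = None
        guidance_scale: Optional[float] = None

    cfg = DemoGenerateConfig(num_inference_steps=30, negative_prompt='blurry, low quality')
    print(cfg.model_extra)  # {'negative_prompt': 'blurry, low quality'}
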
evalscope/models/utils/openai.py CHANGED
@@ -209,7 +209,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +220,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +239,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-        content = openai_assistant_content(choice.message)
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/benchmark.py CHANGED
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass
 
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -1,8 +1,8 @@
 import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -44,10 +44,13 @@ class BenchmarkData:
         api_plugin.parse_responses(self.response_messages, request=self.request)
 
     def update_gpu_usage(self):
-        total_memory = 0
-        for i in range(torch.cuda.device_count()):
-            total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
-        self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+        if check_import('torch'):
+
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
 class Metrics:
evalscope/perf/utils/local_server.py CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 
evalscope/report/__init__.py CHANGED
@@ -14,7 +14,6 @@ else:
         'gen_table',
         'get_data_frame',
         'get_report_list',
-        'gen_report_table',
     ],
     'generator': [
         'ReportGenerator',