evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/config.py
CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,7 +15,6 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
@@ -28,51 +27,102 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
 
     def __post_init__(self):
         self.__init_model_and_id()
@@ -88,14 +138,15 @@ class TaskConfig(BaseArgument):
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE
 
         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'
 
@@ -113,6 +164,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -185,6 +241,9 @@ class TaskConfig(BaseArgument):
         result = copy.deepcopy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
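With `model` now typed as `Union[str, Model, ModelAPI]`, a task can be built around an already-constructed model object, and `model_id` is derived from that object rather than from a path. A minimal sketch of the two call styles, based only on the fields shown above (model and dataset names are placeholders):

    from evalscope.config import TaskConfig
    from evalscope.api.model import ModelAPI

    # string path, as before: model_id falls back to the basename of the path
    cfg = TaskConfig(model='Qwen/Qwen2.5-7B-Instruct', datasets=['gsm8k'])

    # preloaded API object: model_id is taken from ModelAPI.model_name
    # (for a Model instance, Model.name is used instead)
    # my_api = SomeModelAPISubclass(model_name='my-model')  # hypothetical subclass
    # cfg = TaskConfig(model=my_api, datasets=['gsm8k'])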
evalscope/constants.py
CHANGED
@@ -70,6 +70,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -127,3 +128,10 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
evalscope/evaluator/evaluator.py
CHANGED
@@ -96,7 +96,9 @@ class DefaultEvaluator(Evaluator):
 
         # Process each subset (e.g., test, validation) independently
         for subset, dataset in dataset_dict.items():
-
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score
 
@@ -181,7 +183,7 @@ class DefaultEvaluator(Evaluator):
                 model_result = self.cache_manager.save_prediction_cache(
                     subset, task_state, self.benchmark.save_metadata
                 )
-                logger.debug(f'Model result: \n{model_result.
+                logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
             except Exception as exc:
                 logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
@@ -261,10 +263,10 @@ class DefaultEvaluator(Evaluator):
                     sample_score=sample_score,
                     save_metadata=self.benchmark.save_metadata
                 )
-                logger.debug(f'Review result: \n{review_result.
+                logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
             except Exception as exc:
-                logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
+                logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
                 if self.task_config.ignore_errors:
                     logger.warning('Error ignored, continuing with next sample.')
                 else:
evalscope/metrics/llm_judge.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
            str: The response from the LLM
         """
-
-
-
-
-
-        input_messages
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
            # Send request using ServerModelAdapter
            response = self.model.generate(input_messages)
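The new `messages` parameter lets callers skip the internal prompt/system_prompt assembly and hand the judge model a pre-built conversation. A short sketch of both call paths, assuming an `LLMJudge` instance is already configured (constructor arguments omitted; the grading text is a placeholder):

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

    # judge = LLMJudge(...)  # configured elsewhere

    # 1) prompt + optional system prompt, assembled internally as before
    # reply = judge.judge(prompt='Score this answer from 1 to 5: ...',
    #                     system_prompt='You are a strict grader.')

    # 2) pre-built message list, passed through to the judge model unchanged
    # reply = judge.judge(messages=[
    #     ChatMessageSystem(content='You are a strict grader.'),
    #     ChatMessageUser(content='Score this answer from 1 to 5: ...'),
    # ])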
evalscope/models/image_edit_model.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
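For reference, the `model_args` keys that `ImageEditAPI.__init__` consumes before forwarding the remainder to `from_pretrained` are `model_path`, `precision`/`torch_dtype`, `device_map`, and `pipeline_cls`. A hedged configuration sketch (the values are illustrative only):

    model_args = {
        'model_path': '/local/path/to/Qwen-Image-Edit',  # optional; model_name is used if absent
        'precision': 'bfloat16',                         # mapped through DTYPE_MAP to a torch dtype
        'device_map': 'cuda:0',                          # defaults to get_device() when omitted
        'pipeline_cls': 'QwenImageEditPipeline',         # auto-selected when 'qwen' is in the model name
    }

Any remaining keys are passed straight through to `<pipeline_cls>.from_pretrained(...)`.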
evalscope/models/model_apis.py
CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,21 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
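The two `check_import` call styles used in this release suggest its contract: with `raise_error=True` it fails fast and points at the package to install, otherwise it returns a boolean. A minimal sketch of that reading (the behavioural details are inferred from usage, not confirmed against the helper's source):

    from evalscope.utils.import_utils import check_import

    # hard requirement: raise with an install hint, e.g. pip install "evalscope[aigc]"
    check_import('diffusers', package='evalscope[aigc]', raise_error=True)

    # soft requirement: branch on availability without raising
    if check_import('torch'):
        import torch  # safe to run torch-only code paths here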
evalscope/models/openai_compatible.py
CHANGED
@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
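The new normalization makes the client tolerant of both the bare API root and a fully qualified chat-completions URL. Worked examples of the exact expression added above (the host and port are placeholders):

    # 'http://127.0.0.1:8000/v1'                   -> 'http://127.0.0.1:8000/v1'
    # 'http://127.0.0.1:8000/v1/'                  -> 'http://127.0.0.1:8000/v1'
    # 'http://127.0.0.1:8000/v1/chat/completions'  -> 'http://127.0.0.1:8000/v1'
    base_url = 'http://127.0.0.1:8000/v1/chat/completions'
    base_url = base_url.rstrip('/').removesuffix('/chat/completions')

Because `rstrip('/')` runs first, even 'http://127.0.0.1:8000/v1/chat/completions/' normalizes to the same '/v1' root.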
evalscope/models/text2image_model.py
CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-
-
+        # update with extra model parameters
+        kwargs.update(config.model_extra)
 
         # assume the first text as prompt
         prompt = input[0].text
evalscope/models/utils/openai.py
CHANGED
@@ -209,7 +209,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +220,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +239,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/benchmark.py
CHANGED
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass
 
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -1,8 +1,8 @@
 import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -44,10 +44,13 @@ class BenchmarkData:
         api_plugin.parse_responses(self.response_messages, request=self.request)
 
     def update_gpu_usage(self):
-
-
-
-
+        if check_import('torch'):
+
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
 class Metrics:
evalscope/perf/utils/local_server.py
CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 