evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/api/mixin/sandbox_mixin.py
ADDED
@@ -0,0 +1,204 @@
+import asyncio
+import threading
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from ms_enclave.sandbox.manager import SandboxManager
+
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class SandboxMixin:
+    """Sandbox mixin for sandboxed code execution."""
+
+    def __init__(self, task_config: 'TaskConfig'):
+        self._task_config = task_config
+
+        self._manager: Optional['SandboxManager'] = None
+        """Sandbox manager instance."""
+
+        self._sandbox_id: Optional[str] = None
+        """Sandbox ID."""
+
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+        """Event loop for async operations."""
+
+        # Initialize sandbox synchronously by running async methods
+        if self.use_sandbox:
+            self._loop = asyncio.new_event_loop()
+
+            # Start the loop in a separate thread
+            def run_loop():
+                asyncio.set_event_loop(self._loop)
+                self._loop.run_forever()
+
+            self._loop_thread = threading.Thread(target=run_loop, daemon=True)
+            self._loop_thread.start()
+
+            # Wait for initialization
+            future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
+            future.result()
+
+        super().__init__()
+
+    async def _async_init(self):
+        """Async initialization helper."""
+        await self.init_sandbox_manager_async()
+        await self.init_sandbox_async()
+
+    @property
+    def use_sandbox(self) -> bool:
+        """
+        Return whether to use sandbox for the benchmark.
+        """
+        if not self._task_config:
+            return False
+        else:
+            return self._task_config.use_sandbox
+
+    @property
+    def sandbox_manager(self) -> Optional['SandboxManager']:
+        """Get the sandbox manager instance."""
+        return self._manager
+
+    @property
+    def sandbox_id(self) -> Optional[str]:
+        """Get the sandbox ID."""
+        return self._sandbox_id
+
+    async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager asynchronously."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+        manager_config = self._task_config.sandbox_manager_config or {}
+        if manager_config.get('base_url'):
+            # Remote manager
+            self._manager = HttpSandboxManager(**manager_config)
+        else:
+            # Local manager
+            self._manager = LocalSandboxManager(**manager_config)
+
+        await self._manager.start()
+        logger.info('Sandbox manager initialized.')
+        return self._manager
+
+    def init_sandbox_manager(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
+            return future.result()
+        else:
+            # Fallback for cases where no loop is available
+            return asyncio.run(self.init_sandbox_manager_async())
+
+    async def init_sandbox_async(self) -> Optional[str]:
+        """Initialize the sandbox instance asynchronously."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+        sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+            image='python:3.11-slim', tools_config={
+                'shell_executor': {},
+                'python_executor': {}
+            }
+        )
+        sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+        self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+        sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+        logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+        return self._sandbox_id
+
+    def init_sandbox(self) -> Optional[str]:
+        """Initialize the sandbox instance."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
+            return future.result()
+        else:
+            # Fallback for cases where no loop is available
+            return asyncio.run(self.init_sandbox_async())
+
+    def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+        """Execute code in the sandbox."""
+        if not self._sandbox_id or not self._manager:
+            logger.warning('Sandbox is not initialized.')
+            return {'error': 'Sandbox is not initialized.'}
+
+        from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+        async def _execute_async():
+            if language.lower() == 'python':
+                tool_name = 'python_executor'
+                parameters = {'code': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            elif language.lower() == 'shell':
+                tool_name = 'shell_executor'
+                parameters = {'command': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            else:
+                logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                result = ToolResult(
+                    status=ExecutionStatus.ERROR,
+                    tool_name='code_executor',
+                    output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                )
+            return result
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
+            result = future.result(timeout + 10)  # Add some buffer to the timeout
+        else:
+            # Fallback for cases where no loop is available
+            result = asyncio.run(_execute_async())
+
+        return result.model_dump(exclude_none=True)
+
+    def sandbox_finalize(self, *args, **kwargs):
+        """Finalize the sandbox manager."""
+        if self._manager:
+            try:
+                if self._loop and not self._loop.is_closed():
+                    # Stop the manager using the dedicated loop
+                    future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
+                    future.result(timeout=30)

+                    # Stop the event loop
+                    self._loop.call_soon_threadsafe(self._loop.stop)
+                    if hasattr(self, '_loop_thread'):
+                        self._loop_thread.join(timeout=5)
+
+                logger.info('Sandbox manager finalized.')
+            except Exception as e:
+                logger.warning(f'Error finalizing sandbox manager: {e}')
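The mixin above gives benchmark adapters a synchronous entry point (`execute_code_in_sandbox`) over the async ms_enclave sandbox manager. A minimal usage sketch, assuming `SandboxMixin` is exported from `evalscope.api.mixin` and that the task config enables the sandbox; the adapter class and scoring logic are illustrative only:

```python
from evalscope.api.mixin import SandboxMixin  # import path assumed from the file layout above


class CodeEvalAdapter(SandboxMixin):
    """Hypothetical adapter that scores model-generated code by running it in the sandbox."""

    def run_candidate(self, code: str) -> dict:
        # Runs inside the Docker sandbox created from TaskConfig.sandbox_config;
        # returns ToolResult.model_dump(exclude_none=True) as produced above.
        return self.execute_code_in_sandbox(code, timeout=30, language='python')


adapter = CodeEvalAdapter(task_config=my_task_config)   # my_task_config: a placeholder TaskConfig with use_sandbox=True
output = adapter.run_candidate('print("hello")')
adapter.sandbox_finalize()
```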
evalscope/api/model/generate_config.py
CHANGED
@@ -25,9 +25,7 @@ class ResponseSchema(BaseModel):
 
 class GenerateConfig(BaseModel):
     """Model generation options."""
-
-    max_retries: Optional[int] = Field(default=None)
-    """Maximum number of times to retry request (defaults to unlimited)."""
+    model_config = {'extra': 'allow'}
 
     timeout: Optional[int] = Field(default=None)
     """Request timeout (in seconds)."""
@@ -38,9 +36,6 @@ class GenerateConfig(BaseModel):
     stream: Optional[bool] = Field(default=None)
     """Whether to stream the response (default is model specific)."""
 
-    system_message: Optional[str] = Field(default=None)
-    """Override the default system message."""
-
     max_tokens: Optional[int] = Field(default=None)
     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
 
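With `model_config = {'extra': 'allow'}`, pydantic now keeps unknown generation options on the config instead of rejecting them. A small sketch of the effect (the extra field name is illustrative):

```python
from evalscope.api.model.generate_config import GenerateConfig

cfg = GenerateConfig(max_tokens=512, top_k=20)   # top_k is not a declared field, but is accepted
print(cfg.model_dump(exclude_none=True))         # roughly {'max_tokens': 512, 'top_k': 20}
```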
evalscope/api/model/model.py
CHANGED
@@ -318,7 +318,7 @@ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
 
 @thread_safe
 def get_model(
-    model: str,
+    model: Union[str, Model, ModelAPI],
     eval_type: str,
     base_url: Optional[str] = None,
     api_key: Optional[str] = None,
@@ -346,6 +346,9 @@ def get_model(
     if isinstance(model, Model):
         return model
 
+    if isinstance(model, ModelAPI):
+        return Model(model, config, model_args)
+
     # see if we can return a memoized model instance
     # (exclude mockllm since custom_outputs is an infinite generator)
     model_cache_key: str = ''
@@ -362,7 +365,7 @@ def get_model(
 
     logger.info(
         f'Creating model {model} with eval_type={eval_type} '
-        f'base_url={base_url},
+        f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
     )
 
     # find a matching model type
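The widened signature means `get_model` can now be handed an already-constructed `ModelAPI` implementation and will wrap it in a `Model` directly. A sketch under that assumption; the helper that produces the `ModelAPI` instance and the `eval_type` value are placeholders, not part of the diff:

```python
from evalscope.api.model.model import get_model  # module path per the diff above

api = make_model_api()                              # placeholder: any existing ModelAPI instance
model = get_model(model=api, eval_type='service')   # returned as Model(api, config, model_args)
```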
evalscope/api/tool/tool_info.py
CHANGED
@@ -1,7 +1,7 @@
 import inspect
 from dataclasses import dataclass
 from docstring_parser import Docstring, parse
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
 
 from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
evalscope/app/app.py
CHANGED
@@ -6,6 +6,7 @@ import argparse
 from evalscope.utils.logger import configure_logging
 from .arguments import add_argument
 from .ui import create_app_ui
+from .utils.env_utils import setup_env
 
 
 def create_app(args: argparse.Namespace):
@@ -17,6 +18,8 @@ def create_app(args: argparse.Namespace):
     """
     configure_logging(debug=args.debug)
 
+    setup_env(args)
+
     demo = create_app_ui(args)
 
     demo.launch(
evalscope/app/ui/single_model.py
CHANGED
@@ -198,9 +198,9 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
 
         # Process the data for display
         input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
-        generated_md =
-        gold_md =
-        pred_md =
+        generated_md = convert_markdown_image(row['Generated'])
+        gold_md = convert_markdown_image(row['Gold'])
+        pred_md = process_model_prediction(row['Pred'])
         score_md = process_json_content(row['Score'])
         nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
 
evalscope/app/utils/data_utils.py
CHANGED
@@ -2,7 +2,6 @@
 Data loading and processing utilities for the Evalscope dashboard.
 """
 import glob
-import numpy as np
 import os
 import pandas as pd
 from typing import Any, Dict, List, Union
@@ -160,17 +159,18 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
             continue
 
-        prediction = sample_score.score.prediction
-        target = review_result.target
-        extracted_prediction = sample_score.score.extracted_prediction
         score = sample_score.score
+        metadata = sample_score.sample_metadata
+        prediction = score.prediction
+        target = review_result.target
+        extracted_prediction = score.extracted_prediction
         raw_d = {
             'Index': str(review_result.index),
             'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
-            'Metadata':
-            'Generated': prediction
+            'Metadata': metadata,
+            'Generated': prediction,
             'Gold': target,
-            'Pred': extracted_prediction,
+            'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
             'Score': score.model_dump(exclude_none=True),
             'NScore': normalize_score(score.main_value)
         }
evalscope/app/utils/env_utils.py
ADDED
@@ -0,0 +1,12 @@
+# flake8: noqa
+import os
+
+
+def setup_env(args):
+    compat_dsw_gradio(args)
+
+
+def compat_dsw_gradio(args) -> None:
+    if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+                                           in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+        os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
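The helper only kicks in inside a DSW notebook: when `JUPYTER_NAME` looks like `dsw-...` and `GRADIO_ROOT_PATH` is unset, it derives the Gradio root path from the proxy prefix and server port. An illustrative run (the environment values are made up):

```python
import os
from types import SimpleNamespace

from evalscope.app.utils.env_utils import compat_dsw_gradio

os.environ['JUPYTER_NAME'] = 'dsw-12345'              # hypothetical DSW environment
compat_dsw_gradio(SimpleNamespace(server_port=7860))  # stand-in for the parsed app arguments
print(os.environ['GRADIO_ROOT_PATH'])                 # /dsw-12345/proxy/7860
```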
evalscope/app/utils/text_utils.py
CHANGED
@@ -2,11 +2,9 @@
 Text processing utilities for the Evalscope dashboard.
 """
 import json
-import numpy as np
 import os
-import pandas as pd
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
 logger = get_logger()
 
 
-def convert_markdown_image(text):
-    if
-
-
-
-        text = os.path.abspath(text)
-        image_tag = f''
-        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+def convert_markdown_image(text: str):
+    if text.startswith('data:image'):
+        # Convert base64 image data to a markdown image tag
+        image_tag = f''
+        logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
         return image_tag
+    elif os.path.isfile(text):
+        # Convert the image path to a markdown image tag
+        if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+            text = os.path.abspath(text)
+            image_tag = f''
+            logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+            return image_tag
     return text
 
 
@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     return result
 
 
-def process_model_prediction(item: Any, max_length: int =
+def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
     if isinstance(item, (dict, list)):
         result = json.dumps(item, ensure_ascii=False, indent=2)
         result = f'```json\n{result}\n```'
evalscope/arguments.py
CHANGED
@@ -2,7 +2,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
 
 
 class ParseStrArgsAction(argparse.Action):
@@ -60,8 +60,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
 
     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
@@ -77,7 +76,6 @@ def add_argument(parser: argparse.ArgumentParser):
     # Debug and runtime mode arguments
     parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
-    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
@@ -89,6 +87,12 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
+
+    # Sandbox-related arguments
+    parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.')  # noqa: E501
+    parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.')  # noqa: E501
+    parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.')  # noqa: E501
+    parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.')  # noqa: E501
     # yapf: enable
 
 
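The new flags map onto the `TaskConfig` fields read by `SandboxMixin` (`use_sandbox`, `sandbox_type`, `sandbox_config`, `sandbox_manager_config`). A sketch of the programmatic equivalent, with placeholder model and dataset values:

```python
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen-plus',                             # placeholder model name
    datasets=['humaneval'],                        # placeholder dataset
    use_sandbox=True,                              # same as --use-sandbox
    sandbox_type='docker',                         # same as --sandbox-type
    sandbox_config={'image': 'python:3.11-slim'},  # same as --sandbox-config '{"image": "python:3.11-slim"}'
)
run_task(task)
```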
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
             datasets: list, the datasets.
             models: list, the models.
             work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-            dry_run (Optional): bool, the dry-run flag. Default to False.
             debug (Optional): bool, the debug flag. Default to False.
             reuse (Optional): str, reuse previous outputs & results. Default to None.
             generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
             cmd_str = f'python -m run_oc ' \
                       f'--models {" ".join(self.args.models)} ' \
                       f'--datasets {" ".join(self.args.datasets)} ' \
-                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
 
         elif cmd_mode == CmdMode.SCRIPT:
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
             automodel_args=self.model_kwargs,
         )
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
         self.supported_encode_params = get_supported_params(self.model.predict)
 
     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
         self.framework = ['API']
 
         self.model = OpenAIEmbeddings(
@@ -196,7 +204,7 @@ class APIEmbeddingModel(BaseModel):
             openai_api_base=self.openai_api_base,
             openai_api_key=self.openai_api_key,
             dimensions=self.dimensions,
-            check_embedding_ctx_length=
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
         )
 
         super().__init__(model_name_or_path=self.model_name, **kwargs)
evalscope/benchmarks/ai2d/ai2d_adapter.py
ADDED
@@ -0,0 +1,53 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ai2d',
+        pretty_name='AI2D',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description='A Diagram Is Worth A Dozen Images',
+        dataset_id='lmms-lab/ai2d',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class Ai2dAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: list[str] = record['options']
+        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        content_list: list[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+
+        label_answer = chr(int(record['answer']) + ord('A'))
+
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
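Once registered, the benchmark is addressed by the `name` field above. A minimal run sketch (the model name is a placeholder and a vision-capable, OpenAI-compatible endpoint is assumed):

```python
from evalscope import TaskConfig, run_task

run_task(TaskConfig(
    model='qwen-vl-plus',   # placeholder multimodal model
    datasets=['ai2d'],      # matches BenchmarkMeta(name='ai2d') registered above
    limit=10,               # small smoke-test subset
))
```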
evalscope/benchmarks/amc/amc_adapter.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='amc',
+        pretty_name='AMC',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'AMC (American Mathematics Competitions) is a series of mathematics competitions for high school students.',
+        dataset_id='evalscope/amc_22-24',
+        subset_list=['amc22', 'amc23', 'amc24'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class AMCAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Use split as subset
+        self.split_as_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            metadata={
+                'year': record['year'],
+                'url': record['url'],
+                'solution': record.get('solution', '')
+            },
+        )
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
-        Extract
+        Extract normalized answer for BBH multiple-choice tasks.
+        Handles formats like:
+        - "answer is (A)"
+        - "The answer is A."
+        - Extra text after answer.
+        Always uses the *last* occurrence of "answer is".
         """
-
-
-
-
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
+        if ans.endswith('.'):
+            ans = ans[:-1].strip()
+
+        # Capture uppercase letter inside parentheses (A) (B) ...
+        match = re.search(r'\(([A-Z])\)', ans)
         if match:
             return match.group(1)
-
+
+        # Capture single uppercase letter
+        match = re.search(r'\b([A-Z])\b', ans)
         if match:
             return match.group(1)
+
         return ans
 
     @classmethod
     def _extract_ff_answer(cls, ans: str):
         """
-        Extract the answer
+        Extract the normalized answer for BBH free-form tasks.
+        Handles patterns like:
+        - "answer is XXX."
+        - "The answer is **valid**."
+        - Extra trailing dots / line breaks.
+        - Bold-marked answers (**xxx**).
+        Always uses the *last* occurrence of "answer is".
         """
-
+        ans = ans.strip()
 
-
-        if
-
-
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
 
-
-        if len(ans_line) != 1:
-            ans = ans_line[1].strip()
-            ans = ans.split('\n')[0]
+        # Remove trailing period
         if ans.endswith('.'):
-            ans = ans[:-1]
+            ans = ans[:-1].strip()
+
+        # If answer is in bold (**xxx**), prefer the content inside
+        match = re.search(r'\*\*(.*?)\*\*', ans)
+        if match:
+            ans = match.group(1).strip()
+
         return ans
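Worked examples of the tightened extraction rules above, traced by hand against the new code (inputs are illustrative):

```python
from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter

# Multiple-choice: the last "So the answer is" clause wins, the trailing period is
# dropped, then the letter is pulled out of the parentheses.
BBHAdapter._extract_mc_answer('Let us think step by step. So the answer is (B).')  # -> 'B'

# Free-form: same clause split, then the bold-marked span is preferred.
BBHAdapter._extract_ff_answer('So the answer is **valid**.')                       # -> 'valid'
```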