evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/api/mixin/sandbox_mixin.py ADDED
@@ -0,0 +1,204 @@
+ import asyncio
+ import threading
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from ms_enclave.sandbox.manager import SandboxManager
+
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class SandboxMixin:
+     """Sandbox mixin for sandboxed code execution."""
+
+     def __init__(self, task_config: 'TaskConfig'):
+         self._task_config = task_config
+
+         self._manager: Optional['SandboxManager'] = None
+         """Sandbox manager instance."""
+
+         self._sandbox_id: Optional[str] = None
+         """Sandbox ID."""
+
+         self._loop: Optional[asyncio.AbstractEventLoop] = None
+         """Event loop for async operations."""
+
+         # Initialize sandbox synchronously by running async methods
+         if self.use_sandbox:
+             self._loop = asyncio.new_event_loop()
+
+             # Start the loop in a separate thread
+             def run_loop():
+                 asyncio.set_event_loop(self._loop)
+                 self._loop.run_forever()
+
+             self._loop_thread = threading.Thread(target=run_loop, daemon=True)
+             self._loop_thread.start()
+
+             # Wait for initialization
+             future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
+             future.result()
+
+         super().__init__()
+
+     async def _async_init(self):
+         """Async initialization helper."""
+         await self.init_sandbox_manager_async()
+         await self.init_sandbox_async()
+
+     @property
+     def use_sandbox(self) -> bool:
+         """
+         Return whether to use sandbox for the benchmark.
+         """
+         if not self._task_config:
+             return False
+         else:
+             return self._task_config.use_sandbox
+
+     @property
+     def sandbox_manager(self) -> Optional['SandboxManager']:
+         """Get the sandbox manager instance."""
+         return self._manager
+
+     @property
+     def sandbox_id(self) -> Optional[str]:
+         """Get the sandbox ID."""
+         return self._sandbox_id
+
+     async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+         """Initialize the sandbox manager asynchronously."""
+         if self._manager is not None:
+             return self._manager
+
+         if not self.use_sandbox:
+             return None
+
+         from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+         manager_config = self._task_config.sandbox_manager_config or {}
+         if manager_config.get('base_url'):
+             # Remote manager
+             self._manager = HttpSandboxManager(**manager_config)
+         else:
+             # Local manager
+             self._manager = LocalSandboxManager(**manager_config)
+
+         await self._manager.start()
+         logger.info('Sandbox manager initialized.')
+         return self._manager
+
+     def init_sandbox_manager(self) -> Optional['SandboxManager']:
+         """Initialize the sandbox manager."""
+         if self._manager is not None:
+             return self._manager
+
+         if not self.use_sandbox:
+             return None
+
+         # Use the dedicated loop if available
+         if self._loop and not self._loop.is_closed():
+             future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
+             return future.result()
+         else:
+             # Fallback for cases where no loop is available
+             return asyncio.run(self.init_sandbox_manager_async())
+
+     async def init_sandbox_async(self) -> Optional[str]:
+         """Initialize the sandbox instance asynchronously."""
+         if self._sandbox_id is not None:
+             return self._sandbox_id
+
+         if not self.use_sandbox:
+             return None
+
+         from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+         sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+             image='python:3.11-slim', tools_config={
+                 'shell_executor': {},
+                 'python_executor': {}
+             }
+         )
+         sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+         self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+         sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+         logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+         return self._sandbox_id
+
+     def init_sandbox(self) -> Optional[str]:
+         """Initialize the sandbox instance."""
+         if self._sandbox_id is not None:
+             return self._sandbox_id
+
+         if not self.use_sandbox:
+             return None
+
+         # Use the dedicated loop if available
+         if self._loop and not self._loop.is_closed():
+             future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
+             return future.result()
+         else:
+             # Fallback for cases where no loop is available
+             return asyncio.run(self.init_sandbox_async())
+
+     def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+         """Execute code in the sandbox."""
+         if not self._sandbox_id or not self._manager:
+             logger.warning('Sandbox is not initialized.')
+             return {'error': 'Sandbox is not initialized.'}
+
+         from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+         async def _execute_async():
+             if language.lower() == 'python':
+                 tool_name = 'python_executor'
+                 parameters = {'code': code, 'timeout': timeout}
+                 result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+             elif language.lower() == 'shell':
+                 tool_name = 'shell_executor'
+                 parameters = {'command': code, 'timeout': timeout}
+                 result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+             else:
+                 logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                 result = ToolResult(
+                     status=ExecutionStatus.ERROR,
+                     tool_name='code_executor',
+                     output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                 )
+             return result
+
+         # Use the dedicated loop if available
+         if self._loop and not self._loop.is_closed():
+             future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
+             result = future.result(timeout + 10)  # Add some buffer to the timeout
+         else:
+             # Fallback for cases where no loop is available
+             result = asyncio.run(_execute_async())
+
+         return result.model_dump(exclude_none=True)
+
+     def sandbox_finalize(self, *args, **kwargs):
+         """Finalize the sandbox manager."""
+         if self._manager:
+             try:
+                 if self._loop and not self._loop.is_closed():
+                     # Stop the manager using the dedicated loop
+                     future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
+                     future.result(timeout=30)
+
+                     # Stop the event loop
+                     self._loop.call_soon_threadsafe(self._loop.stop)
+                     if hasattr(self, '_loop_thread'):
+                         self._loop_thread.join(timeout=5)
+
+                 logger.info('Sandbox manager finalized.')
+             except Exception as e:
+                 logger.warning(f'Error finalizing sandbox manager: {e}')
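The mixin drives async sandbox calls from synchronous adapter code by parking a private event loop in a daemon thread and submitting coroutines to it. A standalone sketch of that pattern, with a plain coroutine standing in for a sandbox tool call (no evalscope or ms_enclave imports needed):

import asyncio
import threading

# Standalone sketch of the pattern SandboxMixin relies on: a private event loop
# runs in a daemon thread, and synchronous code submits coroutines to it.
loop = asyncio.new_event_loop()

def run_loop():
    asyncio.set_event_loop(loop)
    loop.run_forever()

thread = threading.Thread(target=run_loop, daemon=True)
thread.start()

async def fake_tool_call(x: int) -> int:
    await asyncio.sleep(0.1)  # stands in for an async sandbox tool call
    return x * 2

# The synchronous caller blocks on the result, mirroring execute_code_in_sandbox.
future = asyncio.run_coroutine_threadsafe(fake_tool_call(21), loop)
print(future.result(timeout=5))  # 42

# Shutdown mirrors sandbox_finalize.
loop.call_soon_threadsafe(loop.stop)
thread.join(timeout=5)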
evalscope/api/model/generate_config.py CHANGED
@@ -25,9 +25,7 @@ class ResponseSchema(BaseModel):
 
  class GenerateConfig(BaseModel):
      """Model generation options."""
-
-     max_retries: Optional[int] = Field(default=None)
-     """Maximum number of times to retry request (defaults to unlimited)."""
+     model_config = {'extra': 'allow'}
 
      timeout: Optional[int] = Field(default=None)
      """Request timeout (in seconds)."""
@@ -38,9 +36,6 @@ class GenerateConfig(BaseModel):
      stream: Optional[bool] = Field(default=None)
      """Whether to stream the response (default is model specific)."""
 
-     system_message: Optional[str] = Field(default=None)
-     """Override the default system message."""
-
      max_tokens: Optional[int] = Field(default=None)
      """The maximum number of tokens that can be generated in the completion (default is model specific)."""
 
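The key change in GenerateConfig above is model_config = {'extra': 'allow'}: unknown generation options now pass through validation instead of being rejected. A minimal pydantic v2 sketch (class name and fields trimmed here; top_k and repetition_penalty are arbitrary extras):

from typing import Optional
from pydantic import BaseModel, Field

# Minimal sketch: with extra='allow', unrecognized generation options are kept
# on the model instead of raising a ValidationError.
class GenerateConfigSketch(BaseModel):
    model_config = {'extra': 'allow'}

    timeout: Optional[int] = Field(default=None)
    max_tokens: Optional[int] = Field(default=None)

cfg = GenerateConfigSketch(max_tokens=256, top_k=20, repetition_penalty=1.05)
print(cfg.model_dump(exclude_none=True))
# {'max_tokens': 256, 'top_k': 20, 'repetition_penalty': 1.05}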
evalscope/api/model/model.py CHANGED
@@ -318,7 +318,7 @@ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
 
  @thread_safe
  def get_model(
-     model: str,
+     model: Union[str, Model, ModelAPI],
      eval_type: str,
      base_url: Optional[str] = None,
      api_key: Optional[str] = None,
@@ -346,6 +346,9 @@ def get_model(
      if isinstance(model, Model):
          return model
 
+     if isinstance(model, ModelAPI):
+         return Model(model, config, model_args)
+
      # see if we can return a memoized model instance
      # (exclude mockllm since custom_outputs is an infinite generator)
      model_cache_key: str = ''
@@ -362,7 +365,7 @@
 
      logger.info(
          f'Creating model {model} with eval_type={eval_type} '
-         f'base_url={base_url}, api_key={api_key}, config={config}, model_args={model_args}'
+         f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
      )
 
      # find a matching model type
evalscope/api/tool/tool_info.py CHANGED
@@ -1,7 +1,7 @@
  import inspect
  from dataclasses import dataclass
  from docstring_parser import Docstring, parse
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, field_validator
  from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
 
  from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
evalscope/app/app.py CHANGED
@@ -6,6 +6,7 @@ import argparse
  from evalscope.utils.logger import configure_logging
  from .arguments import add_argument
  from .ui import create_app_ui
+ from .utils.env_utils import setup_env
 
 
  def create_app(args: argparse.Namespace):
@@ -17,6 +18,8 @@ def create_app(args: argparse.Namespace):
      """
      configure_logging(debug=args.debug)
 
+     setup_env(args)
+
      demo = create_app_ui(args)
 
      demo.launch(
evalscope/app/ui/single_model.py CHANGED
@@ -198,9 +198,9 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
 
      # Process the data for display
      input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
-     generated_md = process_model_prediction(row['Generated'])
-     gold_md = process_model_prediction(row['Gold'])
-     pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+     generated_md = convert_markdown_image(row['Generated'])
+     gold_md = convert_markdown_image(row['Gold'])
+     pred_md = process_model_prediction(row['Pred'])
      score_md = process_json_content(row['Score'])
      nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
 
evalscope/app/utils/data_utils.py CHANGED
@@ -2,7 +2,6 @@
  Data loading and processing utilities for the Evalscope dashboard.
  """
  import glob
- import numpy as np
  import os
  import pandas as pd
  from typing import Any, Dict, List, Union
@@ -160,17 +159,18 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
          if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
              continue
 
-         prediction = sample_score.score.prediction
-         target = review_result.target
-         extracted_prediction = sample_score.score.extracted_prediction
          score = sample_score.score
+         metadata = sample_score.sample_metadata
+         prediction = score.prediction
+         target = review_result.target
+         extracted_prediction = score.extracted_prediction
          raw_d = {
              'Index': str(review_result.index),
              'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
-             'Metadata': sample_score.sample_metadata,
-             'Generated': prediction if prediction != extracted_prediction else '*Same as Pred*',
+             'Metadata': metadata,
+             'Generated': prediction,
              'Gold': target,
-             'Pred': extracted_prediction,
+             'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
              'Score': score.model_dump(exclude_none=True),
              'NScore': normalize_score(score.main_value)
          }
evalscope/app/utils/env_utils.py ADDED
@@ -0,0 +1,12 @@
+ # flake8: noqa
+ import os
+
+
+ def setup_env(args):
+     compat_dsw_gradio(args)
+
+
+ def compat_dsw_gradio(args) -> None:
+     if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+                                            in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+         os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
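A quick check of the DSW compatibility shim above (the environment values are made up for illustration; requires the evalscope package to be importable):

import os
from types import SimpleNamespace

# Simulate a DSW notebook environment; values here are placeholders.
os.environ['JUPYTER_NAME'] = 'dsw-12345'
os.environ.pop('GRADIO_ROOT_PATH', None)

from evalscope.app.utils.env_utils import compat_dsw_gradio

compat_dsw_gradio(SimpleNamespace(server_port=7860))
print(os.environ['GRADIO_ROOT_PATH'])  # /dsw-12345/proxy/7860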
evalscope/app/utils/text_utils.py CHANGED
@@ -2,11 +2,9 @@
  Text processing utilities for the Evalscope dashboard.
  """
  import json
- import numpy as np
  import os
- import pandas as pd
  import re
- from typing import Any, Dict, List
+ from typing import Any, Dict, List, Optional
 
  from evalscope.utils.logger import get_logger
  from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
  logger = get_logger()
 
 
- def convert_markdown_image(text):
-     if not os.path.isfile(text):
-         return text
-     # Convert the image path to a markdown image tag
-     if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
-         text = os.path.abspath(text)
-         image_tag = f'![image](gradio_api/file={text})'
-         logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+ def convert_markdown_image(text: str):
+     if text.startswith('data:image'):
+         # Convert base64 image data to a markdown image tag
+         image_tag = f'![image]({text})'
+         logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
          return image_tag
+     elif os.path.isfile(text):
+         # Convert the image path to a markdown image tag
+         if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+             text = os.path.abspath(text)
+             image_tag = f'![image](gradio_api/file={text})'
+             logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+             return image_tag
      return text
 
 
@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
      return result
 
 
- def process_model_prediction(item: Any, max_length: int = 32000) -> str:
+ def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
      if isinstance(item, (dict, list)):
          result = json.dumps(item, ensure_ascii=False, indent=2)
          result = f'```json\n{result}\n```'
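Usage sketch for the reworked convert_markdown_image above (module path taken from the files-changed list; outputs follow the branches shown in the diff):

# Base64 data URIs are now turned into markdown image tags; other text passes
# through unchanged unless it is an existing image file path.
from evalscope.app.utils.text_utils import convert_markdown_image

print(convert_markdown_image('data:image/png;base64,iVBORw0KGgo='))
# ![image](data:image/png;base64,iVBORw0KGgo=)

print(convert_markdown_image('no image here'))
# no image here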
evalscope/arguments.py CHANGED
@@ -2,7 +2,7 @@
  import argparse
  import json
 
- from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask, OutputType
+ from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
 
 
  class ParseStrArgsAction(argparse.Action):
@@ -60,8 +60,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
 
      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
-                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
      parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                          choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
@@ -77,7 +76,6 @@ def add_argument(parser: argparse.ArgumentParser):
      # Debug and runtime mode arguments
      parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
-     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
      parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
      parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
@@ -89,6 +87,12 @@
      parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
      parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
      parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
+
+     # Sandbox-related arguments
+     parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.')  # noqa: E501
+     parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.')  # noqa: E501
+     parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.')  # noqa: E501
+     parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.')  # noqa: E501
      # yapf: enable
 
 
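A self-contained argparse sketch mirroring the new sandbox flags above (the parser is rebuilt locally for illustration; the real definitions live in evalscope/arguments.py):

import argparse
import json

# Rebuild just the sandbox flags to show how the JSON-typed options parse.
parser = argparse.ArgumentParser()
parser.add_argument('--use-sandbox', action='store_true', default=False)
parser.add_argument('--sandbox-type', type=str, default='docker')
parser.add_argument('--sandbox-config', type=json.loads, default='{}')
parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}')

args = parser.parse_args([
    '--use-sandbox',
    '--sandbox-config', '{"image": "python:3.11-slim", "tools_config": {"python_executor": {}}}',
])
print(args.use_sandbox)      # True
print(args.sandbox_type)     # docker
print(args.sandbox_config)   # parsed into a dict by json.loads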
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
              datasets: list, the datasets.
              models: list, the models.
              work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-             dry_run (Optional): bool, the dry-run flag. Default to False.
              debug (Optional): bool, the debug flag. Default to False.
              reuse (Optional): str, reuse previous outputs & results. Default to None.
              generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
              cmd_str = f'python -m run_oc ' \
                        f'--models {" ".join(self.args.models)} ' \
                        f'--datasets {" ".join(self.args.datasets)} ' \
-                       f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                        f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
 
          elif cmd_mode == CmdMode.SCRIPT:
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
              max_length=self.max_seq_length,
              automodel_args=self.model_kwargs,
          )
+         self.tokenizer = self.model.tokenizer
+         # set pad token
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+         if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+             self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
          self.supported_encode_params = get_supported_params(self.model.predict)
 
      def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
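The pad-token handling added to CrossEncoderModel follows the common transformers fallback of reusing the EOS token. A standalone illustration (gpt2 is an arbitrary public checkpoint, chosen only because it ships without a pad token):

from transformers import AutoTokenizer

# Load a tokenizer that has no pad token and apply the same fallback.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)  # <|endoftext|>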
@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
          self.openai_api_base = kwargs.get('api_base')
          self.openai_api_key = kwargs.get('api_key')
          self.dimensions = kwargs.get('dimensions')
+         self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
          self.framework = ['API']
 
          self.model = OpenAIEmbeddings(
@@ -196,7 +204,7 @@
              openai_api_base=self.openai_api_base,
              openai_api_key=self.openai_api_key,
              dimensions=self.dimensions,
-             check_embedding_ctx_length=False
+             check_embedding_ctx_length=self.check_embedding_ctx_length,
          )
 
          super().__init__(model_name_or_path=self.model_name, **kwargs)
evalscope/benchmarks/ai2d/ai2d_adapter.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+ logger = get_logger()
+
+ SUBSET_LIST = ['default']
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ai2d',
+         pretty_name='AI2D',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description='A Diagram Is Worth A Dozen Images',
+         dataset_id='lmms-lab/ai2d',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=MULT_CHOICE_PROMPT,
+     )
+ )
+ class Ai2dAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         answers_list: list[str] = record['options']
+         input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+         content_list: list[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+
+         label_answer = chr(int(record['answer']) + ord('A'))
+
+         return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         answers = parse_answers(task_state)
+         return ''.join(sorted(list(answers)))
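Note how record_to_sample maps the stringified answer index in the AI2D records to a letter label; in isolation:

# The gold answer arrives as a stringified option index and is converted to a
# letter label, exactly as in record_to_sample above.
for idx in ['0', '1', '2', '3']:
    print(idx, '->', chr(int(idx) + ord('A')))  # 0 -> A, 1 -> B, 2 -> C, 3 -> D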
evalscope/benchmarks/amc/amc_adapter.py ADDED
@@ -0,0 +1,46 @@
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='amc',
+         pretty_name='AMC',
+         tags=[Tags.MATH, Tags.REASONING],
+         description=
+         'AMC (American Mathematics Competitions) is a series of mathematics competitions for high school students.',
+         dataset_id='evalscope/amc_22-24',
+         subset_list=['amc22', 'amc23', 'amc24'],
+         metric_list=[{
+             'acc': {
+                 'numeric': True
+             }
+         }],
+         prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+     )
+ )
+ class AMCAdapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # Use split as subset
+         self.split_as_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=record['problem'],
+             target=record['answer'],
+             metadata={
+                 'year': record['year'],
+                 'url': record['url'],
+                 'solution': record.get('solution', '')
+             },
+         )
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """
-         Extract the answer from the model output for Multiple choice task.
+         Extract normalized answer for BBH multiple-choice tasks.
+         Handles formats like:
+         - "answer is (A)"
+         - "The answer is A."
+         - Extra text after answer.
+         Always uses the *last* occurrence of "answer is".
          """
-         ans_line = ans.split('answer is ')
-         if len(ans_line) != 1:
-             ans = ans_line[1].strip()
-         match = re.search(r'\(([A-Z])\)*', ans)
+         ans = ans.strip()
+
+         parts = ans.split('So the answer is ')
+         if len(parts) > 1:
+             ans = parts[-1].strip()
+             ans = ans.split('\n')[0].strip()
+
+         # Remove trailing period
+         if ans.endswith('.'):
+             ans = ans[:-1].strip()
+
+         # Capture uppercase letter inside parentheses (A) (B) ...
+         match = re.search(r'\(([A-Z])\)', ans)
          if match:
              return match.group(1)
-         match = re.search(r'([A-Z])', ans)
+
+         # Capture single uppercase letter
+         match = re.search(r'\b([A-Z])\b', ans)
          if match:
              return match.group(1)
+
          return ans
 
      @classmethod
      def _extract_ff_answer(cls, ans: str):
          """
-         Extract the answer from the model output for Free-form task.
+         Extract the normalized answer for BBH free-form tasks.
+         Handles patterns like:
+         - "answer is XXX."
+         - "The answer is **valid**."
+         - Extra trailing dots / line breaks.
+         - Bold-marked answers (**xxx**).
+         Always uses the *last* occurrence of "answer is".
          """
-         pattern = r'answer is\s+(.*?)\.'
+         ans = ans.strip()
 
-         match = re.search(pattern, ans)
-         if match:
-             res = match.group(1)
-             return res
+         parts = ans.split('So the answer is ')
+         if len(parts) > 1:
+             ans = parts[-1].strip()
+             ans = ans.split('\n')[0].strip()
 
-         ans_line = ans.split('answer is ')
-         if len(ans_line) != 1:
-             ans = ans_line[1].strip()
-             ans = ans.split('\n')[0]
+         # Remove trailing period
          if ans.endswith('.'):
-             ans = ans[:-1]
+             ans = ans[:-1].strip()
+
+         # If answer is in bold (**xxx**), prefer the content inside
+         match = re.search(r'\*\*(.*?)\*\*', ans)
+         if match:
+             ans = match.group(1).strip()
+
          return ans
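A quick sanity check of the new extraction rules (import path taken from the files-changed list; expected outputs follow the logic in the hunk above):

# Both extractors are classmethods, so no adapter instance is needed.
from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter

print(BBHAdapter._extract_mc_answer('Let us think step by step. So the answer is (B).'))
# B
print(BBHAdapter._extract_ff_answer('So the answer is **valid**.\nDone.'))
# valid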