evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -1,8 +1,8 @@
  import time
- import torch
  from dataclasses import dataclass, field
  from typing import Any, List, Optional, Tuple

+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -44,10 +44,12 @@ class BenchmarkData:
  api_plugin.parse_responses(self.response_messages, request=self.request)

  def update_gpu_usage(self):
- total_memory = 0
- for i in range(torch.cuda.device_count()):
- total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
- self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+ if check_import('torch', raise_warning=False):
+ import torch
+ total_memory = 0
+ for i in range(torch.cuda.device_count()):
+ total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
+ self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


  class Metrics:
evalscope/perf/utils/local_server.py CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
  from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
  def start_app(args: Arguments):
  logger.info('Starting local server, please wait...')
  if args.api == 'local':
+ check_import('torch', 'torch', raise_error=True)
+
  app = create_app(args.model, args.attn_implementation)
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/report/__init__.py CHANGED
@@ -14,7 +14,6 @@ else:
  'gen_table',
  'get_data_frame',
  'get_report_list',
- 'gen_report_table',
  ],
  'generator': [
  'ReportGenerator',
evalscope/report/combinator.py CHANGED
@@ -86,28 +86,3 @@ def gen_table(
  add_overall_metric=add_overall_metric
  )
  return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
-
-
- class ReportsRecorder:
- COMMON_DATASET_PATH = []
- CUSTOM_DATASET_PATH = []
-
- def __init__(self, oss_url: str = '', endpoint: str = ''):
- pass
-
-
- if __name__ == '__main__':
- report_dir_1 = './outputs/20250117_151926'
- # report_dir_2 = './outputs/20250107_204445/reports'
-
- report_table = gen_table(reports_path_list=[report_dir_1])
- print(report_table)
-
- # ALL VALUES ONLY FOR EXAMPLE
- # +--------------------------+-------------------+-------------+
- # | Model | CompetitionMath | GSM8K |
- # +==========================+===================+=============+
- # | ZhipuAI_chatglm2-6b-base | 25.0 (acc) | 30.50 (acc) |
- # +--------------------------+-------------------+-------------+
- # | ZhipuAI_chatglm2-6b | 30.5 (acc) | 40.50 (acc) |
- # +--------------------------+-------------------+-------------+
evalscope/report/generator.py CHANGED
@@ -8,105 +8,26 @@ from evalscope.report.report import *
  if TYPE_CHECKING:
  from evalscope.api.benchmark import DataAdapter
  from evalscope.api.metric import AggScore
- from evalscope.benchmarks import DataAdapter as OldDataAdapter


  class ReportGenerator:

  @staticmethod
- def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'OldDataAdapter', **kwargs) -> Report:
- """
- Generate a report for a specific dataset based on provided subset scores.
-
- Args:
- subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
- {
- 'subset_name': [
- {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
- {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
- ],
- ...
- }
- report_name (str): The name of the report to generate.
- data_adapter (DataAdapter): An adapter object for data handling.
-
- Returns:
- Report: A structured report object containing metrics, categories, and subsets.
-
- >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
- """ # noqa: E501
-
- dataset_name = data_adapter.name
- category_map = data_adapter.category_map
- report_name = f'{model_name}@{dataset_name}'
-
- def flatten_subset() -> DataFrame:
- """
- Flatten subset score map to a DataFrame.
-
- Example:
- name score num categories metric_name
- 0 ARC-Easy 0.5 2 [default] AverageAccuracy
- 1 ARC-Challenge 0.5 2 [default] AverageAccuracy
- """
- subsets = []
- for subset_name, scores in subset_score_map.items():
- for score_item in scores:
- categories = category_map.get(subset_name, ['default'])
- if isinstance(categories, str):
- categories = [categories]
- subsets.append(
- dict(
- name=subset_name,
- score=score_item['score'],
- num=score_item['num'],
- metric_name=score_item['metric_name'],
- categories=tuple(categories)
- )
- )
- df = pd.DataFrame(subsets)
- return df
-
- df = flatten_subset()
-
+ def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
  metrics_list = []
- for metric_name, group_metric in df.groupby('metric_name', sort=False):
+ for metric_name, group_metric in df.groupby('metric', sort=False):
  categories = []
  for category_name, group_category in group_metric.groupby('categories'):
  subsets = []
- for _, row in group_category.iterrows():
- subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
-
+ for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+ 'subset_name']):
+ avg_score = group_subset['score'].mean()
+ num = group_subset['score'].count()
+ subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
  categories.append(Category(name=category_name, subsets=subsets))
-
  metrics_list.append(Metric(name=metric_name, categories=categories))
-
- report = Report(
- name=report_name,
- metrics=metrics_list,
- dataset_name=dataset_name,
- model_name=model_name,
- dataset_description=data_adapter.description,
- dataset_pretty_name=data_adapter.pretty_name
- )
- return report
-
- @staticmethod
- def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
- categories = []
- for category_name, group_category in df.groupby('categories'):
- subsets = []
- for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
- avg_score = group_subset['score'].mean()
- num = group_subset['score'].count()
- subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
- categories.append(Category(name=category_name, subsets=subsets))
  return Report(
- name=DataCollection.NAME,
- metrics=[Metric(name='Average', categories=categories)],
- dataset_name=all_dataset_name,
- model_name=model_name
+ name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
  )

  @staticmethod
evalscope/report/report.py CHANGED
@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
  """


- def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+ def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
  """
  Normalize score.

@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
  score = round(score, keep_num)
  elif isinstance(score, dict):
  score = {k: round(v, keep_num) for k, v in score.items()}
+ elif isinstance(score, int):
+ score = float(score)
  else:
  logger.warning(f'Unknown score type: {type(score)}')
-
  return score


@@ -103,6 +104,7 @@ class ReportKey:
  subset_name = 'Subset'
  num = 'Num'
  score = 'Score'
+ overall_score = 'OVERALL'


  @dataclass
@@ -181,12 +183,14 @@ class Report:
  table[ReportKey.num].append(subset.num)
  table[ReportKey.score].append(subset.score)
  # add overall metric when there are multiple subsets
- if metric_count > 1 and add_overall_metric:
+ if metric_count > 1 and add_overall_metric and (
+ ReportKey.overall_score not in table[ReportKey.subset_name]
+ ):
  table[ReportKey.model_name].append(self.model_name)
  table[ReportKey.dataset_name].append(self.dataset_name)
  table[ReportKey.metric_name].append(metric.name)
  table[ReportKey.category_name].append(('-', ))
- table[ReportKey.subset_name].append('OVERALL')
+ table[ReportKey.subset_name].append(ReportKey.overall_score)
  table[ReportKey.num].append(metric.num)
  table[ReportKey.score].append(metric.score)
  # NOTE: only flatten metrics if needed, use the first metric by default
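Editor's note: a quick sketch of the widened normalize_score behaviour, with illustrative values; the import path assumes the file shown above (evalscope/report/report.py).

from evalscope.report.report import normalize_score

normalize_score(0.333333)            # -> 0.3333  (rounded to keep_num=4 digits, as before)
normalize_score({'acc': 0.666666})   # -> {'acc': 0.6667}
normalize_score(1)                   # -> 1.0     (new branch: plain ints are coerced to float)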
evalscope/run.py CHANGED
@@ -131,8 +131,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
  )
  evaluators.append(evaluator)

- # Update task_config.dataset_args with benchmark metadata
- task_config.dataset_args[dataset_name] = benchmark.to_dict()
+ # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+ if dataset_name != DataCollection.NAME:
+ task_config.dataset_args[dataset_name] = benchmark.to_dict()

  # dump task_cfg to outputs.configs_dir after creating evaluators
  task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +150,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
  logger.info(f'Overall report table: \n{report_table} \n')
  except Exception:
  logger.error('Failed to generate report table.')
-
  # Clean up
  if model is not None:
  import gc
- import torch

  del model
  del evaluators
- torch.cuda.empty_cache()
  gc.collect()

+ from evalscope.utils.import_utils import check_import
+ if check_import('torch', raise_warning=False):
+ import torch
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
  return eval_results
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,9 +1,5 @@
- import os
  from dataclasses import dataclass
- from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template

- # 设置GPU环境变量
- os.environ['CUDA_VISIBLE_DEVICES'] = '0'

  @dataclass
  class SwiftInferArgs:
evalscope/utils/chat_service.py CHANGED
@@ -1,6 +1,5 @@
  import os
  import time
- import torch
  from contextlib import contextmanager
  from functools import partial
  from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
  class ChatService:

  def __init__(self, model_path, attn_implementation):
+ import torch
  from modelscope import AutoModelForCausalLM, AutoTokenizer
  from transformers import TextIteratorStreamer
evalscope/utils/function_utils.py CHANGED
@@ -1,4 +1,6 @@
  import threading
+ import time
+ from contextlib import contextmanager
  from functools import wraps


@@ -27,3 +29,42 @@ def thread_safe(func):
  return func(*args, **kwargs)

  return wrapper
+
+
+ def retry_func(retries=3, sleep_interval=0):
+ """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+ def decorator(func):
+
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ last_exception = None
+ for attempt in range(retries):
+ try:
+ return func(*args, **kwargs)
+ except Exception as e:
+ last_exception = e
+ if sleep_interval > 0:
+ time.sleep(sleep_interval)
+ raise last_exception
+
+ return wrapper
+
+ return decorator
+
+
+ @contextmanager
+ def retry_context(retries=3, sleep_interval=0):
+ """A context manager that retries the code block up to `retries` times if an exception occurs."""
+ last_exception = None
+ for attempt in range(retries):
+ try:
+ yield
+ return # If no exception, exit successfully
+ except Exception as e:
+ last_exception = e
+ if sleep_interval > 0:
+ time.sleep(sleep_interval)
+ if attempt == retries - 1: # Last attempt
+ break
+ raise last_exception
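Editor's note: a minimal usage sketch of the new retry_func decorator, based only on the signature above (the flaky_call function is illustrative, not part of evalscope; retry_context is its context-manager counterpart).

import random

from evalscope.utils.function_utils import retry_func

@retry_func(retries=3, sleep_interval=1)
def flaky_call():
    # Illustrative only: fails at random; retry_func re-invokes the function up to
    # 3 times, sleeping 1s between attempts, and re-raises the last exception if
    # every attempt fails.
    if random.random() < 0.5:
        raise ConnectionError('transient failure')
    return 'ok'

print(flaky_call())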
evalscope/utils/import_utils.py CHANGED
@@ -5,13 +5,85 @@ import importlib
  import os
  from itertools import chain
  from types import ModuleType
- from typing import Any
+ from typing import Any, Optional, Union

+ from evalscope.constants import IS_BUILD_DOC
  from .logger import get_logger

  logger = get_logger() # pylint: disable=invalid-name


+ def check_import(
+ module_name: Union[str, list[str]],
+ package: Optional[Union[str, list[str]]] = None,
+ raise_warning: bool = True,
+ raise_error: bool = False,
+ feature_name: Optional[str] = 'this feature',
+ ) -> bool:
+ """Check if a module or list of modules can be imported.
+
+ Args:
+ module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+ package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+ Defaults to None.
+ raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+ raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+ feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+ Defaults to 'this feature'.
+
+ Returns:
+ bool: True if all modules can be imported, False otherwise.
+ """
+ # Convert single strings to lists for uniform processing
+ if isinstance(module_name, str):
+ module_names = [module_name]
+ else:
+ module_names = module_name
+
+ if package is None:
+ packages = [None] * len(module_names)
+ elif isinstance(package, str):
+ packages = [package] * len(module_names)
+ else:
+ packages = package
+ # Ensure packages list has same length as module_names
+ if len(packages) < len(module_names):
+ packages.extend([None] * (len(module_names) - len(packages)))
+
+ missing_modules = []
+ missing_packages = []
+
+ for i, mod_name in enumerate(module_names):
+ try:
+ importlib.import_module(mod_name)
+ except ImportError:
+ missing_modules.append(mod_name)
+ if i < len(packages) and packages[i]:
+ missing_packages.append(packages[i])
+
+ if missing_modules:
+ if len(missing_modules) == 1:
+ error_msg = f'`{missing_modules[0]}` not found.'
+ else:
+ error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+ if missing_packages:
+ if len(missing_packages) == 1:
+ error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+ else:
+ unique_packages = list(dict.fromkeys(missing_packages)) # Remove duplicates while preserving order
+ error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+ if raise_warning:
+ logger.warning(error_msg)
+
+ if not IS_BUILD_DOC and raise_error:
+ raise ImportError(error_msg)
+ return False
+
+ return True
+
+
  class _LazyModule(ModuleType):
  """
  Module class that surfaces all objects but only performs associated imports when the objects are requested.
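Editor's note: a short usage sketch of check_import, grounded in the signature above; the PIL/cv2 to pillow/opencv-python package mappings are illustrative.

from evalscope.utils.import_utils import check_import

# Soft dependency: returns False instead of raising, optionally without logging a warning.
if check_import('torch', raise_warning=False):
    import torch
    print(torch.__version__)

# Hard dependency: raises ImportError with a `pip install ...` hint naming the feature.
check_import(['PIL', 'cv2'], package=['pillow', 'opencv-python'],
             raise_error=True, feature_name='image editing benchmarks')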
evalscope/utils/io_utils.py CHANGED
@@ -1,6 +1,7 @@
  import base64
  import csv
  import hashlib
+ import io
  import json
  import jsonlines as jsonl
  import os
@@ -8,6 +9,7 @@ import re
  import string
  import unicodedata
  import yaml
+ from datetime import datetime
  from io import BytesIO
  from PIL import Image

@@ -122,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
  if not isinstance(data_list, list):
  data_list = [data_list]

+ # Convert non-serializable types to serializable ones
+ data_list = convert_normal_types(data_list)
+
  if dump_mode == DumpMode.OVERWRITE:
  dump_mode = 'w'
  elif dump_mode == DumpMode.APPEND:
@@ -283,22 +288,64 @@ def get_valid_list(input_list, candidate_list):
  [i for i in input_list if i not in candidate_list]


- def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+ def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
  """
  Convert a PIL Image to a base64 encoded string.

  Args:
  image (Image.Image): The PIL Image to convert.
  format (str): The format to save the image in. Default is 'JPEG'.
+ add_header (bool): Whether to add the base64 header. Default is False.
+
  Returns:
  str: Base64 encoded string of the image.
  """
  buffered = BytesIO()
  image.save(buffered, format=format)
  img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+ if add_header:
+ img_str = f'data:image/{format.lower()};base64,{img_str}'
  return img_str


+ def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+ """Convert bytes to a base64 encoded string.
+
+ Args:
+ bytes_data (bytes): The bytes to convert.
+ format (str): The format of the image. Default is 'png'.
+ add_header (bool): Whether to add the base64 header. Default is False.
+ content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
+
+ Returns:
+ str: Base64 encoded string of the bytes.
+ """
+ base64_str = base64.b64encode(bytes_data).decode('utf-8')
+ if add_header:
+ base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+ return base64_str
+
+
+ def base64_to_PIL(base64_str):
+ """Convert a base64 encoded string to a PIL Image.
+
+ Args:
+ base64_str (str): The base64 encoded string.
+
+ Returns:
+ Image.Image: The decoded PIL Image.
+ """
+ # remove header
+ if ',' in base64_str:
+ base64_str = base64_str.split(',', 1)[1]
+
+ # decode
+ img_data = base64.b64decode(base64_str)
+ img_file = io.BytesIO(img_data)
+ img = Image.open(img_file)
+ return img
+
+
  def safe_filename(s: str, max_length: int = 255) -> str:
  """
  Convert a string into a safe filename by removing or replacing unsafe characters.
@@ -351,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
  return s


- def convert_numpy_types(obj):
- """Recursively convert numpy types to native Python types for JSON serialization."""
+ def convert_normal_types(obj):
+ """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
  import numpy as np

- if isinstance(obj, np.bool_):
+ if isinstance(obj, datetime):
+ return obj.isoformat()
+ elif isinstance(obj, np.bool_):
  return bool(obj)
  elif isinstance(obj, np.integer):
  return int(obj)
@@ -364,10 +413,10 @@ def convert_numpy_types(obj):
  elif isinstance(obj, np.ndarray):
  return obj.tolist()
  elif isinstance(obj, dict):
- return {key: convert_numpy_types(value) for key, value in obj.items()}
+ return {key: convert_normal_types(value) for key, value in obj.items()}
  elif isinstance(obj, list):
- return [convert_numpy_types(item) for item in obj]
+ return [convert_normal_types(item) for item in obj]
  elif isinstance(obj, tuple):
- return tuple(convert_numpy_types(item) for item in obj)
+ return tuple(convert_normal_types(item) for item in obj)
  else:
  return obj
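Editor's note: a round-trip sketch of the new image/base64 helpers, using the signatures shown above; the tiny 8x8 image and the raw byte string are illustrative.

from PIL import Image

from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL, bytes_to_base64

img = Image.new('RGB', (8, 8), color='red')  # illustrative image
data_url = PIL_to_base64(img, format='PNG', add_header=True)
# -> 'data:image/png;base64,...' because add_header=True

restored = base64_to_PIL(data_url)  # the data-URL header is stripped before decoding
assert restored.size == (8, 8)

# bytes_to_base64 covers raw bytes with a configurable content type, e.g. audio payloads.
audio_b64 = bytes_to_base64(b'\x00\x01\x02', format='wav', add_header=True, content_type='audio')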
evalscope/utils/json_schema.py CHANGED
@@ -4,7 +4,7 @@ from copy import deepcopy
  from dataclasses import is_dataclass
  from datetime import date, datetime, time
  from enum import EnumMeta
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, field_validator, model_validator
  from typing import (
  Any,
  Dict,
@@ -59,6 +59,26 @@ class JSONSchema(BaseModel):
  required: Optional[List[str]] = Field(default=None)
  """Required fields for object parameters."""

+ @field_validator('type')
+ def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
+ return python_type_to_json_type(v)
+
+ @model_validator(mode='before')
+ def convert_type_before_validation(cls, values):
+ values = deepcopy(values)
+
+ def recursive_convert_type(obj):
+ if isinstance(obj, dict):
+ if 'type' in obj:
+ obj['type'] = python_type_to_json_type(obj['type'])
+ for k, v in obj.items():
+ obj[k] = recursive_convert_type(v)
+ elif isinstance(obj, list):
+ return [recursive_convert_type(item) for item in obj]
+ return obj
+
+ return recursive_convert_type(values)
+

  def json_schema(t: Type[Any]) -> JSONSchema:
  """Provide a JSON Schema for the specified type.
@@ -152,6 +172,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:


  def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+ if python_type is not None and python_type in get_args(JSONType):
+ return python_type
  if python_type == 'str':
  return 'string'
  elif python_type == 'int':
@@ -205,4 +227,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
  return obj

  return cast(Dict[str, Any], _resolve_refs(schema))
- return cast(Dict[str, Any], _resolve_refs(schema))
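Editor's note: a brief sketch of what the new type normalization buys, assuming JSONType enumerates the standard JSON Schema type names and the remaining JSONSchema fields are optional (only `required` is visible in the hunk).

from evalscope.utils.json_schema import JSONSchema, python_type_to_json_type

print(python_type_to_json_type('str'))     # -> 'string'  (mapped as before)
print(python_type_to_json_type('string'))  # -> 'string'  (new early return: already a JSON type)

# The field/model validators now normalize Python-style type names on construction.
print(JSONSchema(type='str').type)         # -> 'string'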
evalscope/utils/logger.py CHANGED
@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
  logging.getLogger('httpx').setLevel(logging.WARNING)
  logging.getLogger('modelscope').setLevel(logging.ERROR)

+ info_set = set()
+ warning_set = set()
+
+
+ def info_once(self, msg, *args, **kwargs):
+ hash_id = kwargs.get('hash_id') or msg
+ if hash_id in info_set:
+ return
+ info_set.add(hash_id)
+ self.info(msg)
+
+
+ def warning_once(self, msg, *args, **kwargs):
+ hash_id = kwargs.get('hash_id') or msg
+ if hash_id in warning_set:
+ return
+ warning_set.add(hash_id)
+ self.warning(msg)
+


  def get_logger(
evalscope/utils/model_utils.py CHANGED
@@ -3,6 +3,8 @@ import random
  from enum import Enum
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

+ from evalscope.utils.import_utils import check_import
+
  if TYPE_CHECKING:
  from transformers import GenerationConfig

@@ -67,7 +69,8 @@ def seed_everything(seed: int):
  """
  random.seed(seed)
  np.random.seed(seed)
- try:
+
+ if check_import('torch', raise_warning=False):
  import torch

  torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
- except ImportError:
- pass