evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -1,8 +1,8 @@
 import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -44,10 +44,12 @@ class BenchmarkData:
         api_plugin.parse_responses(self.response_messages, request=self.request)
 
     def update_gpu_usage(self):
-        total_memory = 0
-        for i in range(torch.cuda.device_count()):
-            total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
-        self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+        if check_import('torch', raise_warning=False):
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
 class Metrics:
evalscope/perf/utils/local_server.py
CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 
evalscope/report/__init__.py
CHANGED
evalscope/report/combinator.py
CHANGED
|
@@ -86,28 +86,3 @@ def gen_table(
|
|
|
86
86
|
add_overall_metric=add_overall_metric
|
|
87
87
|
)
|
|
88
88
|
return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
class ReportsRecorder:
|
|
92
|
-
COMMON_DATASET_PATH = []
|
|
93
|
-
CUSTOM_DATASET_PATH = []
|
|
94
|
-
|
|
95
|
-
def __init__(self, oss_url: str = '', endpoint: str = ''):
|
|
96
|
-
pass
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if __name__ == '__main__':
|
|
100
|
-
report_dir_1 = './outputs/20250117_151926'
|
|
101
|
-
# report_dir_2 = './outputs/20250107_204445/reports'
|
|
102
|
-
|
|
103
|
-
report_table = gen_table(reports_path_list=[report_dir_1])
|
|
104
|
-
print(report_table)
|
|
105
|
-
|
|
106
|
-
# ALL VALUES ONLY FOR EXAMPLE
|
|
107
|
-
# +--------------------------+-------------------+-------------+
|
|
108
|
-
# | Model | CompetitionMath | GSM8K |
|
|
109
|
-
# +==========================+===================+=============+
|
|
110
|
-
# | ZhipuAI_chatglm2-6b-base | 25.0 (acc) | 30.50 (acc) |
|
|
111
|
-
# +--------------------------+-------------------+-------------+
|
|
112
|
-
# | ZhipuAI_chatglm2-6b | 30.5 (acc) | 40.50 (acc) |
|
|
113
|
-
# +--------------------------+-------------------+-------------+
|
evalscope/report/generator.py
CHANGED
@@ -8,105 +8,26 @@ from evalscope.report.report import *
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
     from evalscope.api.metric import AggScore
-    from evalscope.benchmarks import DataAdapter as OldDataAdapter
 
 
 class ReportGenerator:
 
     @staticmethod
-    def
-        """
-        Generate a report for a specific dataset based on provided subset scores.
-
-        Args:
-            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
-                {
-                    'subset_name': [
-                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
-                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
-                    ],
-                    ...
-                }
-            report_name (str): The name of the report to generate.
-            data_adapter (DataAdapter): An adapter object for data handling.
-
-        Returns:
-            Report: A structured report object containing metrics, categories, and subsets.
-
-        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
-        """  # noqa: E501
-
-        dataset_name = data_adapter.name
-        category_map = data_adapter.category_map
-        report_name = f'{model_name}@{dataset_name}'
-
-        def flatten_subset() -> DataFrame:
-            """
-            Flatten subset score map to a DataFrame.
-
-            Example:
-                        name  score  num categories      metric_name
-            0       ARC-Easy    0.5    2  [default]  AverageAccuracy
-            1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
-            """
-            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
-                    categories = category_map.get(subset_name, ['default'])
-                    if isinstance(categories, str):
-                        categories = [categories]
-                    subsets.append(
-                        dict(
-                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)
-                        )
-                    )
-            df = pd.DataFrame(subsets)
-            return df
-
-        df = flatten_subset()
-
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
         metrics_list = []
-        for metric_name, group_metric in df.groupby('
+        for metric_name, group_metric in df.groupby('metric', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
-                for
-
-
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                         'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
                 categories.append(Category(name=category_name, subsets=subsets))
-
             metrics_list.append(Metric(name=metric_name, categories=categories))
-
-        report = Report(
-            name=report_name,
-            metrics=metrics_list,
-            dataset_name=dataset_name,
-            model_name=model_name,
-            dataset_description=data_adapter.description,
-            dataset_pretty_name=data_adapter.pretty_name
-        )
-        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
         return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
         )
 
     @staticmethod
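For orientation, a minimal sketch of how the reworked `gen_collection_report` consumes its input; the rows and the model/collection names below are illustrative, and only the column names ('metric', 'categories', 'dataset_name', 'subset_name', 'score') are taken from the groupby calls above.

import pandas as pd

from evalscope.report.generator import ReportGenerator

# Two samples for the same subset collapse into one Subset entry with the mean score.
df = pd.DataFrame([
    {'metric': 'AverageAccuracy', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 1.0},
    {'metric': 'AverageAccuracy', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 0.0},
])
report = ReportGenerator.gen_collection_report(df, all_dataset_name='my_collection', model_name='my_model')
# -> one 'AverageAccuracy' metric, one ('default', ) category, subset 'gsm8k/main' with score 0.5 over num=2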
evalscope/report/report.py
CHANGED
@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
         score = round(score, keep_num)
     elif isinstance(score, dict):
         score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
     else:
         logger.warning(f'Unknown score type: {type(score)}')
-
     return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
     subset_name = 'Subset'
     num = 'Num'
     score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
             # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                 table[ReportKey.model_name].append(self.model_name)
                 table[ReportKey.dataset_name].append(self.dataset_name)
                 table[ReportKey.metric_name].append(metric.name)
                 table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                 table[ReportKey.num].append(metric.num)
                 table[ReportKey.score].append(metric.score)
         # NOTE: only flatten metrics if needed, use the first metric by default
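A quick sketch of the widened `normalize_score` behavior (the input values are illustrative):

from evalscope.report.report import normalize_score

normalize_score(0.123456)           # -> 0.1235, rounded to 4 decimals as before
normalize_score({'acc': 0.123456})  # -> {'acc': 0.1235}
normalize_score(3)                  # -> 3.0, ints are now coerced to float instead of hitting the warning branch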
evalscope/run.py
CHANGED
@@ -131,8 +131,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         )
         evaluators.append(evaluator)
 
-        # Update task_config.dataset_args with benchmark metadata
-        task_config.dataset_args[dataset_name] = benchmark.to_dict()
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
 
     # dump task_cfg to outputs.configs_dir after creating evaluators
     task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +150,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
     if model is not None:
         import gc
-        import torch
 
         del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
     return eval_results
 
 
evalscope/utils/chat_service.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
evalscope/utils/function_utils.py
CHANGED
@@ -1,4 +1,6 @@
 import threading
+import time
+from contextlib import contextmanager
 from functools import wraps
 
 
@@ -27,3 +29,42 @@ def thread_safe(func):
         return func(*args, **kwargs)
 
     return wrapper
+
+
+def retry_func(retries=3, sleep_interval=0):
+    """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if sleep_interval > 0:
+                        time.sleep(sleep_interval)
+            raise last_exception
+
+        return wrapper
+
+    return decorator
+
+
+@contextmanager
+def retry_context(retries=3, sleep_interval=0):
+    """A context manager that retries the code block up to `retries` times if an exception occurs."""
+    last_exception = None
+    for attempt in range(retries):
+        try:
+            yield
+            return  # If no exception, exit successfully
+        except Exception as e:
+            last_exception = e
+            if sleep_interval > 0:
+                time.sleep(sleep_interval)
+            if attempt == retries - 1:  # Last attempt
+                break
+    raise last_exception
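A usage sketch for the new `retry_func` decorator; `flaky_request` and the attempt counter are placeholders, not part of the package.

from evalscope.utils.function_utils import retry_func

attempts = {'n': 0}


@retry_func(retries=3, sleep_interval=0)
def flaky_request():
    # Fails twice, then succeeds; the last exception is re-raised only if every attempt fails.
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise ConnectionError('transient failure')
    return 'ok'


assert flaky_request() == 'ok' and attempts['n'] == 3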
evalscope/utils/import_utils.py
CHANGED
@@ -5,13 +5,85 @@ import importlib
 import os
 from itertools import chain
 from types import ModuleType
-from typing import Any
+from typing import Any, Optional, Union
 
+from evalscope.constants import IS_BUILD_DOC
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
+def check_import(
+    module_name: Union[str, list[str]],
+    package: Optional[Union[str, list[str]]] = None,
+    raise_warning: bool = True,
+    raise_error: bool = False,
+    feature_name: Optional[str] = 'this feature',
+) -> bool:
+    """Check if a module or list of modules can be imported.
+
+    Args:
+        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+            Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+            Defaults to 'this feature'.
+
+    Returns:
+        bool: True if all modules can be imported, False otherwise.
+    """
+    # Convert single strings to lists for uniform processing
+    if isinstance(module_name, str):
+        module_names = [module_name]
+    else:
+        module_names = module_name
+
+    if package is None:
+        packages = [None] * len(module_names)
+    elif isinstance(package, str):
+        packages = [package] * len(module_names)
+    else:
+        packages = package
+    # Ensure packages list has same length as module_names
+    if len(packages) < len(module_names):
+        packages.extend([None] * (len(module_names) - len(packages)))
+
+    missing_modules = []
+    missing_packages = []
+
+    for i, mod_name in enumerate(module_names):
+        try:
+            importlib.import_module(mod_name)
+        except ImportError:
+            missing_modules.append(mod_name)
+            if i < len(packages) and packages[i]:
+                missing_packages.append(packages[i])
+
+    if missing_modules:
+        if len(missing_modules) == 1:
+            error_msg = f'`{missing_modules[0]}` not found.'
+        else:
+            error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+        if missing_packages:
+            if len(missing_packages) == 1:
+                error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+            else:
+                unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+        if raise_warning:
+            logger.warning(error_msg)
+
+        if not IS_BUILD_DOC and raise_error:
+            raise ImportError(error_msg)
+        return False
+
+    return True
+
+
 class _LazyModule(ModuleType):
     """
     Module class that surfaces all objects but only performs associated imports when the objects are requested.
evalscope/utils/io_utils.py
CHANGED
@@ -1,6 +1,7 @@
 import base64
 import csv
 import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
@@ -8,6 +9,7 @@ import re
 import string
 import unicodedata
 import yaml
+from datetime import datetime
 from io import BytesIO
 from PIL import Image
 
@@ -122,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
     if not isinstance(data_list, list):
         data_list = [data_list]
 
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
+
     if dump_mode == DumpMode.OVERWRITE:
         dump_mode = 'w'
     elif dump_mode == DumpMode.APPEND:
@@ -283,22 +288,64 @@ def get_valid_list(input_list, candidate_list):
         [i for i in input_list if i not in candidate_list]
 
 
-def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
     """
     Convert a PIL Image to a base64 encoded string.
 
     Args:
         image (Image.Image): The PIL Image to convert.
         format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
     Returns:
         str: Base64 encoded string of the image.
     """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
     return img_str
 
 
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
+
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
+
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
+
+
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+
+    Args:
+        base64_str (str): The base64 encoded string.
+
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+
+
 def safe_filename(s: str, max_length: int = 255) -> str:
     """
     Convert a string into a safe filename by removing or replacing unsafe characters.
@@ -351,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     return s
 
 
-def convert_numpy_types(obj):
-    """Recursively convert numpy types to native Python types for JSON serialization."""
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
     import numpy as np
 
-    if isinstance(obj, np.bool_):
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
         return bool(obj)
     elif isinstance(obj, np.integer):
         return int(obj)
@@ -364,10 +413,10 @@ def convert_numpy_types(obj):
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
     elif isinstance(obj, dict):
-        return {key: convert_numpy_types(value) for key, value in obj.items()}
+        return {key: convert_normal_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
-        return [convert_numpy_types(item) for item in obj]
+        return [convert_normal_types(item) for item in obj]
     elif isinstance(obj, tuple):
-        return tuple(convert_numpy_types(item) for item in obj)
+        return tuple(convert_normal_types(item) for item in obj)
     else:
         return obj
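A round-trip sketch for the new image helpers; the tiny in-memory image is synthetic.

import base64

from PIL import Image

from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL, bytes_to_base64

img = Image.new('RGB', (8, 8), color='red')
data_url = PIL_to_base64(img, format='PNG', add_header=True)   # 'data:image/png;base64,...'
restored = base64_to_PIL(data_url)                             # header is stripped before decoding
assert restored.size == (8, 8)

# bytes_to_base64 does the same for raw bytes, e.g. re-encoding the decoded payload:
png_bytes = base64.b64decode(data_url.split(',', 1)[1])
assert bytes_to_base64(png_bytes, format='png', add_header=True).startswith('data:image/png;base64,')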
evalscope/utils/json_schema.py
CHANGED
@@ -4,7 +4,7 @@ from copy import deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import (
     Any,
     Dict,
@@ -59,6 +59,26 @@ class JSONSchema(BaseModel):
     required: Optional[List[str]] = Field(default=None)
     """Required fields for object parameters."""
 
+    @field_validator('type')
+    def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
+        return python_type_to_json_type(v)
+
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                if 'type' in obj:
+                    obj['type'] = python_type_to_json_type(obj['type'])
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+
+        return recursive_convert_type(values)
+
 
 def json_schema(t: Type[Any]) -> JSONSchema:
     """Provide a JSON Schema for the specified type.
@@ -152,6 +172,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:
 
 
 def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
     if python_type == 'str':
         return 'string'
     elif python_type == 'int':
@@ -205,4 +227,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
         return obj
 
     return cast(Dict[str, Any], _resolve_refs(schema))
-    return cast(Dict[str, Any], _resolve_refs(schema))
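A small sketch of what the new validators accept, assuming `JSONType` covers the standard JSON Schema type names:

from evalscope.utils.json_schema import JSONSchema, python_type_to_json_type

python_type_to_json_type('string')  # -> 'string': already a JSON type, returned unchanged by the new early return
python_type_to_json_type('str')     # -> 'string': Python type name mapped as before

schema = JSONSchema(type='int')     # the validators coerce the Python name during model construction
assert schema.type == 'integer'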
evalscope/utils/logger.py
CHANGED
@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+info_set = set()
+warning_set = set()
+
+
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)
+
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)
+
 
 def get_logger(
     log_file: Optional[str] = None,
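The `*_once` helpers deduplicate by message text (or an explicit `hash_id`). The hunk does not show how they are bound to the logger class, so this sketch calls them as the module-level functions they are defined as:

from evalscope.utils import logger as log_utils

log = log_utils.get_logger()
log_utils.warning_once(log, 'torch not found, GPU memory stats disabled')  # emitted
log_utils.warning_once(log, 'torch not found, GPU memory stats disabled')  # suppressed: same message already seen
log_utils.info_once(log, 'result cache enabled', hash_id='cache-banner')   # deduplicated by hash_id instead of text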
evalscope/utils/model_utils.py
CHANGED
@@ -3,6 +3,8 @@ import random
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
+from evalscope.utils.import_utils import check_import
+
 if TYPE_CHECKING:
     from transformers import GenerationConfig
 
@@ -67,7 +69,8 @@ def seed_everything(seed: int):
     """
     random.seed(seed)
     np.random.seed(seed)
-    try:
+
+    if check_import('torch', raise_warning=False):
         import torch
 
         torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
         torch.cuda.manual_seed_all(seed)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
-    except ImportError:
-        pass