evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/models/custom/dummy_model.py CHANGED
@@ -1,61 +1,99 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import time
 from typing import List

+from evalscope.models import CustomModel
 from evalscope.utils.logger import get_logger
-from .custom_model import CustomModel

 logger = get_logger()
-"""
-This script is used to rewrite the evaluation results without re-running the model predictions.
-"""


 class DummyCustomModel(CustomModel):

-    def __init__(self, config: dict = {
+    def __init__(self, config: dict = {}, **kwargs):
         super(DummyCustomModel, self).__init__(config=config, **kwargs)

-    def
-
+    def make_request_messages(self, input_item: dict) -> list:
+        """
+        Make request messages for OpenAI API.
+        """
+        if input_item.get('messages', None):
+            return input_item['messages']

-
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)

-
-
-
-            'message': {
-                'content': response,
-                'role': 'assistant'
-            }
-        }],
-        'created': time.time(),
-        'model': self.config.get('model_id'),  # should be model_id
-        'object': 'chat.completion',
-        'usage': {
-            'completion_tokens': 0,
-            'prompt_tokens': 0,
-            'total_tokens': 0
-        }
-    }
+        messages = []
+        if system_prompt:
+            messages.append({'role': 'system', 'content': system_prompt})

-
+        messages.append({'role': 'user', 'content': query})

+        return messages

-
-
-
-
-
-
+    def predict(self, prompts: List[dict], **kwargs):
+        original_inputs = kwargs.get('origin_inputs', None)
+        infer_cfg = kwargs.get('infer_cfg', None)
+
+        logger.debug(f'** Prompts: {prompts}')
+        if original_inputs is not None:
+            logger.debug(f'** Original inputs: {original_inputs}')
+        if infer_cfg is not None:
+            logger.debug(f'** Inference config: {infer_cfg}')
+
+        # Simulate a response based on the prompts
+        # Must return a list of dicts with the same format as the OpenAI API.
+        responses = []
+        for input_item in original_inputs:
+            message = self.make_request_messages(input_item)
+            response = f'Dummy response for prompt: {message}'
+
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.config.get('model_id'),
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }

-
+            responses.append(res_d)

-
+        return responses

-    task_cfg_d = yaml_to_dict(task_cfg_file)
-    task_cfg_d.update({'model': swift_model})

-
-
+if __name__ == '__main__':
+    from evalscope import TaskConfig, run_task
+
+    dummy_model = DummyCustomModel()
+    task_config = TaskConfig(
+        model=dummy_model,
+        model_id='evalscope-model-dummy',
+        datasets=['gsm8k'],
+        eval_type='custom',  # must be custom for custom model evaluation
+        generation_config={
+            'max_new_tokens': 100,
+            'temperature': 0.0,
+            'top_p': 1.0,
+            'top_k': 50,
+            'repetition_penalty': 1.0
+        },
+        debug=True,
+        limit=5,
+    )
+
+    eval_results = run_task(task_cfg=task_config)
evalscope/models/local_model.py CHANGED
@@ -82,7 +82,7 @@ class LocalImageModel(LocalModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+        self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
         # default to DiffusionPipeline if not specified
         if self.pipeline_cls is None:
             if 'flux' in self.model_id.lower():
evalscope/models/register.py CHANGED
@@ -47,8 +47,9 @@ def register_model_adapter_class(cls, name=None):
 # register all model adapters
 register_model_adapter_class(BaseModelAdapter, name='base')
 register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
-register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.
+register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
 register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
 register_model_adapter_class(CustomModelAdapter, name='custom')
 register_model_adapter_class(ServerModelAdapter, name='server')
+register_model_adapter_class(BFCLAdapter, name='bfcl_server')
 register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
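For context, a minimal sketch of how an additional adapter could be hooked into the same registry as the new `bfcl_server` entry, using the `register_model_adapter_class(cls, name=None)` helper shown in the hunk above. The subclass and the import locations below are assumptions for illustration, not part of this release.

```python
# Hypothetical illustration only: the adapter class and import paths are assumed.
# Registration simply associates a class with a lookup name in the adapter registry.
from evalscope.models.register import register_model_adapter_class
from evalscope.models.adapters import BaseModelAdapter  # assumed import location


class MyServerAdapter(BaseModelAdapter):  # hypothetical adapter
    """Placeholder adapter; a real one would implement the predict interface."""


register_model_adapter_class(MyServerAdapter, name='my_server')
```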
evalscope/perf/arguments.py CHANGED
@@ -55,6 +55,7 @@ class Arguments:

     # Response settings
     frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+    repetition_penalty: Optional[float] = None  # Repetition penalty for the response
     logprobs: Optional[bool] = None  # Whether to log probabilities
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
@@ -181,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Response settings
     parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+    parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
     parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
     parser.add_argument(
         '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
evalscope/perf/benchmark.py CHANGED
@@ -41,14 +41,27 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     message_generator_class = DatasetRegistry(args.dataset)
     message_generator = message_generator_class(args)

+    dataset_messages = []
+    try:
+        for messages in message_generator:
+            dataset_messages.append(messages)
+    except StopIteration:
+        pass
+
+    if not dataset_messages:
+        raise Exception('Dataset is empty!')
+
     count = 0
-
+    dataset_index = 0
+
+    while count < args.number:
+        messages = dataset_messages[dataset_index]
         request = query_generator.build_request(messages, args)
         if request is not None:
             yield request
             count += 1
-
-
+
+        dataset_index = (dataset_index + 1) % len(dataset_messages)

     if args.prompt:
         prompt = load_prompt(args.prompt)

evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -75,6 +75,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
+        if param.repetition_penalty is not None:
+            payload['repetition_penalty'] = param.repetition_penalty
         if param.logprobs is not None:
             payload['logprobs'] = param.logprobs
         if param.n_choices is not None:
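Taken together, the two hunks above add a `repetition_penalty` knob to the perf arguments/CLI and forward it to the request body only when it is set. A standalone sketch of that flow (plain argparse, with a hypothetical payload) for reference:

```python
# Standalone sketch mirroring the wiring above: the flag defaults to None and is
# only copied into the request payload when the user actually provides it.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--frequency-penalty', type=float, default=None)
parser.add_argument('--repetition-penalty', type=float, default=None)
args = parser.parse_args(['--repetition-penalty', '1.05'])

payload = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hi'}]}  # hypothetical
if args.frequency_penalty is not None:
    payload['frequency_penalty'] = args.frequency_penalty
if args.repetition_penalty is not None:
    payload['repetition_penalty'] = args.repetition_penalty

print(payload)  # includes 'repetition_penalty': 1.05
```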
evalscope/report/combinator.py CHANGED
@@ -34,25 +34,51 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:

 def get_data_frame(report_list: List[Report],
                    flatten_metrics: bool = True,
-                   flatten_categories: bool = True
+                   flatten_categories: bool = True,
+                   add_overall_metric: bool = False) -> pd.DataFrame:
     tables = []
     for report in report_list:
-        df = report.to_dataframe(
+        df = report.to_dataframe(
+            flatten_metrics=flatten_metrics,
+            flatten_categories=flatten_categories,
+            add_overall_metric=add_overall_metric)
         tables.append(df)
     return pd.concat(tables, ignore_index=True)


-def gen_table(reports_path_list: list
-
-
-
-
-
-def gen_report_table(report: Report) -> str:
+def gen_table(reports_path_list: list[str] = None,
+              report_list: list[Report] = None,
+              flatten_metrics: bool = True,
+              flatten_categories: bool = True,
+              add_overall_metric: bool = False) -> str:
     """
-
+    Generates a formatted table from a list of report paths or Report objects.
+
+    Args:
+        reports_path_list (list[str], optional): List of file paths to report files.
+            Either this or `report_list` must be provided.
+        report_list (list[Report], optional): List of Report objects.
+            Either this or `reports_path_list` must be provided.
+        flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+        flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+        add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+    Returns:
+        str: A string representation of the table in grid format.
+
+    Raises:
+        AssertionError: If neither `reports_path_list` nor `report_list` is provided.
     """
-
+    assert (reports_path_list is not None) or (report_list is not None), \
+        'Either reports_path_list or report_list must be provided.'
+    if report_list is None:
+        report_list = get_report_list(reports_path_list)
+    # Generate a DataFrame from the report list
+    table = get_data_frame(
+        report_list,
+        flatten_metrics=flatten_metrics,
+        flatten_categories=flatten_categories,
+        add_overall_metric=add_overall_metric)
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


@@ -68,7 +94,7 @@ if __name__ == '__main__':
     report_dir_1 = './outputs/20250117_151926'
     # report_dir_2 = './outputs/20250107_204445/reports'

-    report_table = gen_table([report_dir_1])
+    report_table = gen_table(reports_path_list=[report_dir_1])
     print(report_table)

     # ALL VALUES ONLY FOR EXAMPLE
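A short usage sketch of the reworked `gen_table`, which now takes keyword arguments and accepts either report paths or already-loaded `Report` objects (the output directory below is the placeholder from the example above):

```python
# Usage sketch; the reports directory is a placeholder.
from evalscope.report.combinator import gen_table, get_report_list

# From saved report files on disk:
print(gen_table(reports_path_list=['./outputs/20250117_151926'], add_overall_metric=True))

# Or from Report objects that are already in memory:
reports = get_report_list(['./outputs/20250117_151926'])
print(gen_table(report_list=reports))
```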
evalscope/report/utils.py CHANGED
@@ -133,11 +133,25 @@ class Report:
             data = json.load(f)
         return cls.from_dict(data)

-    def to_dataframe(self,
+    def to_dataframe(self,
+                     flatten_metrics: bool = True,
+                     flatten_categories: bool = True,
+                     add_overall_metric: bool = False) -> pd.DataFrame:
+        """
+        Convert the report to a pandas DataFrame.
+        Args:
+            flatten_metrics (bool): Whether to flatten the metrics to a single row.
+            flatten_categories (bool): Whether to flatten the categories to multiple rows.
+            add_overall_metric (bool): Whether to add an overall metric row.
+        Returns:
+            pd.DataFrame: The report as a pandas DataFrame.
+        """
         table = defaultdict(list)
         for metric in self.metrics:
+            metric_count = 0
             for category in metric.categories:
                 for subset in category.subsets:
+                    metric_count += 1
                     table[ReportKey.model_name].append(self.model_name)
                     table[ReportKey.dataset_name].append(self.dataset_name)
                     table[ReportKey.metric_name].append(metric.name)
@@ -145,6 +159,15 @@ class Report:
                     table[ReportKey.subset_name].append(subset.name)
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
+            # add overall metric when there are multiple subsets
+            if metric_count > 1 and add_overall_metric:
+                table[ReportKey.model_name].append(self.model_name)
+                table[ReportKey.dataset_name].append(self.dataset_name)
+                table[ReportKey.metric_name].append(metric.name)
+                table[ReportKey.category_name].append(('-', ))
+                table[ReportKey.subset_name].append('OVERALL')
+                table[ReportKey.num].append(metric.num)
+                table[ReportKey.score].append(metric.score)
             # NOTE: only flatten metrics if needed, use the first metric by default
             if not flatten_metrics:
                 break
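Correspondingly, a brief sketch of `Report.to_dataframe` with the new flag, which appends an `OVERALL` row for any metric that spans more than one subset (directory path again a placeholder):

```python
# Sketch only: loads a saved report and inspects the per-subset rows plus the
# OVERALL row added when add_overall_metric=True and a metric has multiple subsets.
from evalscope.report.combinator import get_report_list

report = get_report_list(['./outputs/20250117_151926'])[0]
df = report.to_dataframe(flatten_categories=True, add_overall_metric=True)
print(df)
```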
evalscope/run.py CHANGED
@@ -133,7 +133,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:

     # Make overall report
     try:
-        report_table: str = gen_table([outputs.reports_dir])
+        report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
evalscope/summarizer.py CHANGED
@@ -30,7 +30,7 @@ class Summarizer:
            with open(report_file, 'r') as f:
                res_list.append(json.load(f))

-        report_table: str = gen_table([reports_dir])
+        report_table: str = gen_table(reports_path_list=[reports_dir])
         logger.info(f'*** Report table ***\n{report_table}')

         return res_list
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+import csv
 import json
 import jsonlines as jsonl
 import os
@@ -112,8 +113,58 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
         writer.write_all(data_list)


-def jsonl_to_csv():
-
+def jsonl_to_csv(jsonl_file, csv_file):
+    """
+    Convert jsonl file to csv file.
+
+    Args:
+        jsonl_file: jsonl file path.
+        csv_file: csv file path.
+    """
+    data = jsonl_to_list(jsonl_file)
+    if not data:
+        logger.warning(f'No data found in {jsonl_file}.')
+        return
+
+    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(data[0].keys())  # Write header
+        for item in data:
+            writer.writerow(item.values())
+
+
+def csv_to_list(csv_file) -> list:
+    """
+    Read csv file to list.
+
+    Args:
+        csv_file: csv file path.
+
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(csv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            res_list.append(row)
+    return res_list
+
+
+def csv_to_jsonl(csv_file, jsonl_file):
+    """
+    Convert csv file to jsonl file.
+
+    Args:
+        csv_file: csv file path.
+        jsonl_file: jsonl file path.
+    """
+    data = csv_to_list(csv_file)
+    if not data:
+        logger.warning(f'No data found in {csv_file}.')
+        return
+
+    dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)


 def yaml_to_dict(yaml_file) -> dict:
@@ -168,3 +219,9 @@ def dict_to_json(d: dict, json_file: str):
     """
     with open(json_file, 'w') as f:
         json.dump(d, f, indent=4, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    csv_file = 'custom_eval/text/mcq/example_val.csv'
+    jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'
+    csv_to_jsonl(csv_file, jsonl_file)
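A round-trip sketch of the new CSV helpers (file paths are the examples from the `__main__` block above, plus a hypothetical output name); `jsonl_to_csv` uses the keys of the first record as the header row:

```python
# Round-trip sketch: CSV -> JSONL -> CSV. Paths are illustrative only.
from evalscope.utils.io_utils import csv_to_jsonl, jsonl_to_csv

csv_to_jsonl('custom_eval/text/mcq/example_val.csv', 'custom_eval/text/mcq/example_val.jsonl')
jsonl_to_csv('custom_eval/text/mcq/example_val.jsonl', 'custom_eval/text/mcq/example_val_roundtrip.csv')
```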
{evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.16.1
+Version: 0.16.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -40,7 +40,6 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
-Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
@@ -50,6 +49,7 @@ Requires-Dist: iopath; extra == "aigc"
 Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
+Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: datasets>=3.0; extra == "all"
@@ -75,7 +75,6 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
-Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
@@ -102,6 +101,7 @@ Requires-Dist: iopath; extra == "all"
 Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -230,6 +230,7 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
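For the News entries above, a minimal sketch (model id and limit are placeholders) of pointing an evaluation at the `needle_haystack` benchmark mentioned there; heatmaps are written under `outputs/reports`:

```python
# Minimal sketch with placeholder values; see the linked documentation for full options.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['needle_haystack'],
    limit=5,  # keep the smoke test small
)
run_task(task_cfg=task)
```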