evalscope 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
- evalscope/benchmarks/race/race_adapter.py +2 -1
- evalscope/config.py +38 -2
- evalscope/constants.py +24 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- evalscope/models/model_adapter.py +1 -1
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -6
- evalscope/perf/plugin/api/openai_api.py +53 -49
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/run.py +45 -82
- evalscope/run_arena.py +2 -1
- evalscope/summarizer.py +14 -26
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/tools/combine_reports.py +2 -4
- evalscope/tools/rewrite_eval_results.py +1 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +8 -0
- evalscope/utils/utils.py +0 -175
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/RECORD +47 -67
- tests/cli/test_run.py +11 -12
- tests/perf/test_perf.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoTokenizer
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase

@@ -29,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict]
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:

@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):

     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-
-            responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
-            kwargs: (Any): The command line --parameter content.
-        Returns:
-            Tuple: Return number of prompt token and number of completion tokens.
-        """
-        full_response_content = ''
+           Only one response for non-stream, multiple responses for stream.
+        """
+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens
+
+        # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
-        input_tokens = None
-        output_tokens = None
         for response in responses:
             js = json.loads(response)
+            if 'object' in js:
+                self.__process_response_object(js, delta_contents)
+            else:
+                self.__process_no_object(js, delta_contents)
+
+        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+        return input_tokens, output_tokens
+
+    def __process_response_object(self, js, delta_contents):
+        if js['object'] == 'chat.completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['message']['content']]
+        elif js['object'] == 'text_completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['text']]
+        elif js['object'] == 'chat.completion.chunk':
+            for choice in js.get('choices', []):
+                if 'delta' in choice and 'index' in choice:
+                    delta = choice['delta']
+                    idx = choice['index']
+                    if 'content' in delta:
+                        delta_content = delta['content']
+                        delta_contents.setdefault(idx, []).append(delta_content)
+
+    def __process_no_object(self, js, delta_contents):
+        # assume the response is a single choice
+        for choice in js['choices']:
+            if 'delta' in choice:
+                delta = choice['delta']
+                idx = choice['index']
+                if 'content' in delta:
+                    delta_content = delta['content']
+                    delta_contents.setdefault(idx, []).append(delta_content)
+            else:
+                delta_contents[choice['index']] = [choice['message']['content']]
+
+    def __calculate_tokens_from_content(self, request, delta_contents):
+        input_tokens = output_tokens = 0
+        if self.tokenizer is not None:
             for idx, choice_contents in delta_contents.items():
-                full_response_content = ''.join(
+                full_response_content = ''.join(choice_contents)
                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                 output_tokens += len(self.tokenizer.encode(full_response_content))
-
-            input_tokens = 0
-            output_tokens = 0
+        else:
             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
         return input_tokens, output_tokens
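Below is a minimal sketch of the control flow the reworked parse_responses follows: it prefers the server-reported usage block and only falls back to tokenizer-based counting when that block is absent. The count_tokens helper and the payloads are invented for illustration, not part of the package.

```python
import json

# Hedged sketch mirroring the new parse_responses control flow;
# the helper name and the payloads below are invented for illustration.
def count_tokens(responses):
    # For both stream and non-stream requests, the last response is checked
    # for a `usage` block first.
    last = json.loads(responses[-1])
    if last.get('usage'):
        return last['usage']['prompt_tokens'], last['usage']['completion_tokens']
    # Otherwise the plugin concatenates streamed delta contents and encodes
    # them with a local tokenizer (omitted here).
    return None, None

stream_chunks = [
    json.dumps({'object': 'chat.completion.chunk',
                'choices': [{'index': 0, 'delta': {'content': 'Hello'}}]}),
    json.dumps({'object': 'chat.completion.chunk', 'choices': [],
                'usage': {'prompt_tokens': 12, 'completion_tokens': 1}}),
]
print(count_tokens(stream_chunks))  # (12, 1)
```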
evalscope/perf/plugin/registry.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:

@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):

@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
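With the Union typing, register_dataset and register_api now advertise that a plugin class can be registered under one name or several. A hedged sketch of the pattern follows; the registry dict and the example class are stand-ins, and only the decorator signature comes from the diff above.

```python
from typing import List, Type, Union

# Stand-in registry; the real PluginRegistry lives in evalscope.perf.plugin.registry.
_DATASETS = {}

def register_dataset(name: Union[str, List[str]]):
    def class_decorator(cls: Type):
        # Normalize a single name to a list, then register the class under each name.
        names = [name] if isinstance(name, str) else name
        for n in names:
            _DATASETS[n] = cls
        return cls
    return class_decorator

@register_dataset(['openqa', 'openqa_zh'])  # one class, several dataset names (illustrative)
class OpenQADatasetPlugin:
    pass

print(sorted(_DATASETS))  # ['openqa', 'openqa_zh']
```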
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -116,19 +116,19 @@ class BenchmarkMetrics

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
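The summary message now reports throughput as 'Throughput(average tokens/s)' near the top and keeps finer rounding only for the per-token time. A small sketch of the rounding behavior with invented values; only the key names and rounding precision come from the diff above.

```python
# Invented metric values; key names and rounding precision follow the diff above.
default_ndigits = 3
avg_token_per_seconds = 512.34567
avg_time_per_token = 0.0123456

message = {
    'Throughput(average tokens/s)': round(avg_token_per_seconds, default_ndigits),
    'Average time per output token (s)': round(avg_time_per_token, 5),  # finer rounding
}
print(message)
# {'Throughput(average tokens/s)': 512.346, 'Average time per output token (s)': 0.01235}
```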
evalscope/perf/utils/db_util.py
CHANGED
@@ -6,6 +6,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics

@@ -107,44 +108,87 @@ def get_result_db_path(args: Arguments):
     return result_db_path


+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.
+
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows)
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}

+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)': inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)':
+            [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+             for row in rows]
+    }

-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]
+
+    return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
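The new calculate_percentiles helper sorts each metric series and picks the value at index int(n * p / 100), returning NaN when the index is out of range. Below is a runnable sketch with invented latencies; the simplified function body mirrors the helper added above.

```python
from typing import Dict, List

def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    # Simplified mirror of the helper added in this release; sample data below is invented.
    results = {}
    n = len(data)
    data.sort()
    for p in percentiles:
        try:
            value = data[int(n * p / 100)]
            results[p] = round(value, 4)
        except IndexError:
            results[p] = float('nan')
    return results

latencies = [0.8, 1.2, 0.9, 2.5, 1.1, 1.0, 3.2, 0.7, 1.4, 1.3]
print(calculate_percentiles(latencies, [50, 90, 99]))
# {50: 1.2, 90: 3.2, 99: 3.2}
```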
evalscope/perf/utils/local_server.py
CHANGED

@@ -102,6 +102,8 @@ def start_app(args: Arguments):

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',

@@ -111,7 +113,8 @@ def start_app(args: Arguments):
             '--max-model-len', '32768',
             '--gpu-memory-utilization', '0.9',
             '--host', '0.0.0.0',
-            '--port', args.port,
+            '--port', str(args.port),
+            '--trust-remote-code',
             '--disable-log-requests',
             '--disable-log-stats',
         ])
evalscope/run.py
CHANGED
@@ -10,12 +10,13 @@ from datetime import datetime
 from typing import List, Optional, Union

 from evalscope.arguments import parse_args
-from evalscope.config import TaskConfig
-from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
-from evalscope.evaluator import Evaluator
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
+from evalscope.evaluator import Evaluator
 from evalscope.models.custom import CustomModel
 from evalscope.utils import import_module_util, seed_everything
-from evalscope.utils.
+from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+from evalscope.utils.logger import configure_logging, get_logger

 logger = get_logger()

@@ -23,15 +24,6 @@ BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
 MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']


-def configure_logging(debug: bool, outputs: Optional[OutputsStructure]):
-    """Configure logging level based on the debug flag."""
-    if outputs:
-        log_file = os.path.join(outputs.logs_dir, 'eval_log.log')
-        get_logger(log_file=log_file, force=True)
-    if debug:
-        get_logger(log_level=logging.DEBUG, force=True)
-
-
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""
     run_time = datetime.now().strftime('%Y%m%d_%H%M%S')

@@ -48,35 +40,15 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
     seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
-    configure_logging(task_cfg.debug, outputs)
+    configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

+    task_cfg.dump_yaml(outputs.configs_dir)
     logger.info(task_cfg)

-def parse_task_config(task_cfg) -> TaskConfig:
-    """Parse task configuration from various formats into a TaskConfig object."""
-    if isinstance(task_cfg, TaskConfig):
-        logger.info('Args: Task config is provided with TaskConfig type.')
-    elif isinstance(task_cfg, dict):
-        logger.info('Args: Task config is provided with dictionary type.')
-        task_cfg = TaskConfig.from_dict(task_cfg)
-    elif isinstance(task_cfg, Namespace):
-        logger.info('Args: Task config is provided with CommandLine type.')
-        task_cfg = TaskConfig.from_args(task_cfg)
-    elif isinstance(task_cfg, str):
-        extension = task_cfg.split('.')[-1]
-        logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
-            task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
-            task_cfg = TaskConfig.from_json(task_cfg)
-        else:
-            raise ValueError('Args: Unsupported file extension.')
+    if task_cfg.eval_backend != EvalBackend.NATIVE:
+        return run_non_native_backend(task_cfg)
     else:
-        return task_cfg
+        return evaluate_model(task_cfg, outputs)

@@ -84,10 +56,15 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
-    elif task_cfg.work_dir
+    elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
         task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
+
+    if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
+        task_cfg.eval_config['time_str'] = run_time
+    elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
+        task_cfg.eval_config['work_dir'] = task_cfg.work_dir
     return outputs

@@ -125,10 +102,6 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-    task_cfg.dump_yaml(outputs.configs_dir)
-
-    if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)

     for dataset_name in task_cfg.datasets:
         evaluator = create_evaluator(task_cfg, dataset_name, outputs)

@@ -143,45 +116,35 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
     model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)

-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
-        outputs=outputs,
-        datasets_dir=task_cfg.dataset_dir,
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
-    )
+    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
+    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
+    in_prompt_template = dataset_config.get('prompt_template', '')
+    few_shot_num = dataset_config.get('few_shot_num', None)
+    few_shot_random = dataset_config.get('few_shot_random', True)
+
+    data_adapter = imported_modules['DataAdapterClass'](
+        few_shot_num=few_shot_num,
+        few_shot_random=few_shot_random,
+        prompt_template=in_prompt_template,
+        outputs=outputs,
+    )
+    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+
+    logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+
+    return Evaluator(
+        dataset_name_or_path=dataset_name_or_path,
+        subset_list=in_subset_list,
+        data_adapter=data_adapter,
+        model_adapter=model_adapter,
+        use_cache=task_cfg.use_cache,
+        outputs=outputs,
+        datasets_dir=task_cfg.dataset_dir,
+        datasets_hub=task_cfg.dataset_hub,
+        stage=task_cfg.stage,
+        eval_type=task_cfg.eval_type,
+        overall_task_cfg=task_cfg,
+    )


 def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
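run_task accepts a config file path, a dict, a TaskConfig, a list of TaskConfig, or an argparse Namespace, and parse_task_config (now exported from evalscope.config) normalizes all of them. A hedged usage sketch follows; the model id, dataset name, and dataset_args values are placeholders, not taken from this diff.

```python
# Hedged usage sketch; model id and dataset choices are placeholders.
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/Qwen2-0.5B-Instruct',              # placeholder model id
    'datasets': ['gsm8k'],                            # placeholder dataset list
    'dataset_args': {'gsm8k': {'few_shot_num': 4}},   # keys follow dataset_config.get(...) above
    'work_dir': './outputs',
}
results = run_task(task_cfg)  # returns per-dataset report dict(s)
```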
evalscope/run_arena.py
CHANGED
@@ -11,7 +11,8 @@ from tqdm import tqdm
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import
+from evalscope.utils import get_obj_from_cfg
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
evalscope/summarizer.py
CHANGED
@@ -4,10 +4,11 @@ import json
 import os
 from typing import List, Union

-from evalscope.config import TaskConfig
-from evalscope.constants import EvalBackend
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import EvalBackend
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -24,7 +25,7 @@ class Summarizer:
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')

-        report_files: list = glob.glob(os.path.join(reports_dir, '
+        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
         for report_file in report_files:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))

@@ -47,33 +48,20 @@ class Summarizer:
         A report dict is overall report on a benchmark for specific model.
         """
         final_res_list: List[dict] = []
-        candidate_task_cfgs: List[
-
-        if isinstance(task_cfg,
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, str):
-            task_cfg: dict = yaml_to_dict(task_cfg)
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, TaskConfig):
-            task_cfg: dict = task_cfg.to_dict()
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, list):
+        candidate_task_cfgs: List[TaskConfig] = []
+
+        if isinstance(task_cfg, list):
             for task_cfg_item in task_cfg:
-                task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                elif isinstance(task_cfg_item, TaskConfig):
-                    task_cfg_item: dict = task_cfg_item.to_dict()
-                candidate_task_cfgs.append(task_cfg_item)
+                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
         else:
-
+            candidate_task_cfgs.append(parse_task_config(task_cfg))

         for candidate_task in candidate_task_cfgs:
             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.
+            eval_backend = candidate_task.eval_backend

             if eval_backend == EvalBackend.NATIVE:
-                outputs_dir: str = candidate_task.
-                outputs_dir: str = os.path.expanduser(outputs_dir)
+                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                 if outputs_dir is None:
                     raise ValueError(f'No outputs_dir in {task_cfg}')
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)

@@ -128,8 +116,8 @@ class Summarizer:
         return final_res_list

     @staticmethod
-    def parse_eval_config(candidate_task):
-        eval_config: Union[str, dict] = candidate_task.
+    def parse_eval_config(candidate_task: TaskConfig):
+        eval_config: Union[str, dict] = candidate_task.eval_config
         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'

         if isinstance(eval_config, str):
evalscope/third_party/longbench_write/eval.py
CHANGED

@@ -10,7 +10,8 @@ import requests
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm

-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import jsonl_to_list

 logger = get_logger()
evalscope/third_party/longbench_write/longbench_write.py
CHANGED

@@ -4,7 +4,8 @@ from typing import Union

 from evalscope.third_party.longbench_write.eval import run_eval
 from evalscope.third_party.longbench_write.infer import run_infer
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict

 logger = get_logger()
evalscope/third_party/longbench_write/tools/data_etl.py
CHANGED

@@ -6,7 +6,7 @@ from typing import List

 from evalscope.third_party.longbench_write.eval import EvalLength
 from evalscope.third_party.longbench_write.utils import chinese_to_arabic, count_words
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
evalscope/third_party/toolbench_static/toolbench_static.py
CHANGED

@@ -5,7 +5,8 @@ from typing import Union

 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict

 logger = get_logger()
evalscope/tools/combine_reports.py
CHANGED

@@ -19,16 +19,14 @@ def get_report(report_file: str):
     dataset_name = data_d['dataset_name']
     model_name = data_d['model_name']
     score = data_d['score']  # float or dict
+    metric = data_d['metric']
     score_d = {}
     if isinstance(score, dict):
-        # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
         score_d = score
     elif isinstance(score, float):
-
-        score_d['acc'] = score
+        score_d[metric] = score
     else:
         raise ValueError(f'Unknown score type: {type(score)}')
-    # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
     score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])

     return model_name, {'dataset_name': dataset_name, 'score': score_str}