evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/db_util.py
CHANGED

@@ -6,6 +6,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -107,44 +108,87 @@ def get_result_db_path(args: Arguments):
     return result_db_path


-def
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.

-
-
-
-
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-        if len(rows)
+        if len(rows) < len(percentiles):
             logger.info('Too little data to calculate quantiles!')
             return {}

-        #
-
-
+        # Define index variables for columns
+        CHUNK_TIMES_INDEX = 1
+        LATENCY_INDEX = 4
+        FIRST_CHUNK_LATENCY_INDEX = 5
+        PROMPT_TOKENS_INDEX = 8
+        COMPLETION_TOKENS_INDEX = 9
+
+        # Prepare data for each metric
+        inter_token_latencies_all = []
+        for row in rows:
+            inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+        metrics = {
+            'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+            'TPOT (s)':
+            inter_token_latencies_all,
+            'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+            'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+            'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            'Throughput(tokens/s)':
+            [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+             for row in rows]
+        }

-
-
-
+        # Calculate percentiles for each metric
+        results = {'Percentile': [f'{p}%' for p in percentiles]}
+        for metric_name, data in metrics.items():
+            metric_percentiles = calculate_percentiles(data, percentiles)
+            results[metric_name] = [metric_percentiles[p] for p in percentiles]

-
-        return {
-            'Percentile': [f'{p}%' for p in percentiles],
-            'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-            'Latency (s)': [latency_results[p] for p in percentiles]
-        }
+        return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
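The percentile logic added above is a nearest-rank lookup over the sorted values of each metric. The standalone sketch below reproduces that logic from the hunk (the nested JSON-parsing helper is simplified here to take a plain list of timestamps); the sample data at the bottom is invented for illustration and is not part of the package.

from typing import Dict, List


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    # Nearest-rank percentile: sort the values, then index at int(n * p / 100).
    results: Dict[int, float] = {}
    n_success_queries = len(data)
    data.sort()
    for percentile in percentiles:
        try:
            idx = int(n_success_queries * percentile / 100)
            value = data[idx] if data[idx] is not None else float('nan')
            results[percentile] = round(value, 4)
        except IndexError:
            results[percentile] = float('nan')
    return results


def inter_token_latencies(chunk_times: List[float]) -> List[float]:
    # TPOT samples: gaps between consecutive chunk arrival timestamps
    # (the package helper additionally json.loads() the stored chunk_times string).
    return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]


# Invented sample: per-request latencies in seconds.
latencies = [0.82, 0.91, 1.05, 1.10, 1.32, 1.47, 1.58, 1.73, 2.04, 2.96]
print(calculate_percentiles(latencies, [50, 90, 99]))
# -> {50: 1.47, 90: 2.96, 99: 2.96}

# Invented sample: chunk arrival timestamps for one streamed response.
print(inter_token_latencies([0.00, 0.05, 0.11, 0.18]))
# -> [0.05, 0.06, 0.07] (up to float rounding)

Because int(n * percentile / 100) truncates, several high percentiles can resolve to the same index on small samples, and any index past the end of the data is reported as NaN rather than raising.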
@@ -102,6 +102,8 @@ def start_app(args: Arguments):

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',
@@ -111,7 +113,8 @@ def start_app(args: Arguments):
             '--max-model-len', '32768',
             '--gpu-memory-utilization', '0.9',
             '--host', '0.0.0.0',
-            '--port', args.port,
+            '--port', str(args.port),
+            '--trust-remote-code',
             '--disable-log-requests',
             '--disable-log-stats',
         ])
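The '--port' fix above matters because every element of the argument list passed to subprocess.Popen must be a string (or path-like object); a raw int such as a port number raises a TypeError on POSIX before the server even starts. A minimal illustration of the failure mode and the fix, using the stdlib http.server module as a stand-in for the vLLM entrypoint (not code from the package):

import subprocess

port = 8000

# Passing the int directly fails with a TypeError along the lines of
# "expected str, bytes or os.PathLike object, not int":
#   subprocess.Popen(['python', '-m', 'http.server', port])

# Converting to str, as the '--port' argument in the hunk now does, works:
proc = subprocess.Popen(['python', '-m', 'http.server', str(port)])
proc.terminate()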