evalscope 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged as a potentially problematic release.
- evalscope/backend/opencompass/tasks/eval_api.py +2 -1
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
- evalscope/benchmarks/ceval/samples.jsonl +1 -0
- evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
- evalscope/benchmarks/mmlu/samples.jsonl +5 -0
- evalscope/benchmarks/race/samples.jsonl +5 -0
- evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
- evalscope/cli/start_perf.py +8 -11
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
- evalscope/metrics/rouge_metric.py +30 -15
- evalscope/perf/arguments.py +179 -0
- evalscope/perf/benchmark.py +245 -0
- evalscope/perf/http_client.py +127 -711
- evalscope/perf/main.py +35 -0
- evalscope/perf/plugin/__init__.py +2 -0
- evalscope/perf/plugin/api/__init__.py +3 -0
- evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
- evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
- evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
- evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
- evalscope/perf/plugin/datasets/__init__.py +6 -0
- evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
- evalscope/perf/plugin/datasets/custom.py +21 -0
- evalscope/perf/plugin/datasets/flickr8k.py +51 -0
- evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
- evalscope/perf/plugin/datasets/longalpaca.py +28 -0
- evalscope/perf/plugin/datasets/openqa.py +38 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
- evalscope/perf/plugin/registry.py +54 -0
- evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
- evalscope/perf/utils/benchmark_util.py +135 -0
- evalscope/perf/utils/chat_service.py +252 -0
- evalscope/perf/utils/db_util.py +200 -0
- evalscope/perf/utils/handler.py +46 -0
- evalscope/perf/utils/local_server.py +139 -0
- evalscope/registry/config/cfg_arena.yaml +77 -0
- evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
- evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
- evalscope/registry/config/cfg_single.yaml +78 -0
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
- evalscope/registry/data/qa_browser/battle.jsonl +634 -0
- evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
- evalscope/registry/data/question.jsonl +80 -0
- evalscope/third_party/longbench_write/README.md +118 -0
- evalscope/third_party/longbench_write/default_task.json +27 -0
- evalscope/third_party/longbench_write/default_task.yaml +24 -0
- evalscope/third_party/toolbench_static/README.md +118 -0
- evalscope/third_party/toolbench_static/config_default.json +15 -0
- evalscope/third_party/toolbench_static/config_default.yaml +12 -0
- evalscope/third_party/toolbench_static/requirements.txt +2 -0
- evalscope/utils/logger.py +18 -20
- evalscope/utils/utils.py +41 -42
- evalscope/version.py +2 -2
- evalscope-0.7.1.dist-info/LICENSE +203 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/METADATA +93 -35
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/RECORD +101 -31
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/WHEEL +1 -1
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/top_level.txt +1 -0
- tests/cli/__init__.py +1 -0
- tests/cli/test_run.py +76 -0
- tests/perf/__init__.py +1 -0
- tests/perf/test_perf.py +96 -0
- tests/rag/test_clip_benchmark.py +85 -0
- tests/rag/test_mteb.py +136 -0
- tests/rag/test_ragas.py +120 -0
- tests/swift/__init__.py +1 -0
- tests/swift/test_run_swift_eval.py +146 -0
- tests/swift/test_run_swift_vlm_eval.py +128 -0
- tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
- tests/test_run_all.py +12 -0
- tests/vlm/__init__.py +1 -0
- tests/vlm/test_vlmeval.py +59 -0
- evalscope/perf/_logging.py +0 -32
- evalscope/perf/datasets/longalpaca_12k.py +0 -20
- evalscope/perf/datasets/openqa.py +0 -22
- evalscope/perf/plugin_registry.py +0 -35
- evalscope/perf/query_parameters.py +0 -42
- evalscope/perf/server_sent_event.py +0 -43
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
- /evalscope/perf/{datasets → utils}/__init__.py +0 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/entry_points.txt +0 -0
- {evalscope/preprocess → tests}/__init__.py +0 -0
- {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0

evalscope/perf/utils/db_util.py (new file)
@@ -0,0 +1,200 @@
+import base64
+import os
+import pickle
+import sqlite3
+import sys
+from datetime import datetime
+
+import json
+from tabulate import tabulate
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def encode_data(data) -> str:
+    """Encodes data using base64 and pickle."""
+    return base64.b64encode(pickle.dumps(data)).decode('utf-8')
+
+
+def write_json_file(data, output_path):
+    with open(output_path, 'w') as f:
+        json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+def transpose_results(data):
+    headers = data.keys()
+    rows = zip(*data.values())
+
+    return [dict(zip(headers, row)) for row in rows]
+
+
+def create_result_table(cursor):
+    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
+                      request TEXT,
+                      start_time REAL,
+                      chunk_times TEXT,
+                      success INTEGER,
+                      response_messages TEXT,
+                      completed_time REAL,
+                      latency REAL,
+                      first_chunk_latency REAL,
+                      n_chunks INTEGER,
+                      chunk_time REAL,
+                      prompt_tokens INTEGER,
+                      completion_tokens INTEGER,
+                      max_gpu_memory_cost REAL)''')
+
+
+def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
+    request = encode_data(benchmark_data.request)
+    chunk_times = json.dumps(benchmark_data.chunk_times)
+    response_messages = encode_data(benchmark_data.response_messages)
+
+    # Columns common to both success and failure cases
+    common_columns = (
+        request,
+        benchmark_data.start_time,
+        chunk_times,
+        benchmark_data.success,
+        response_messages,
+        benchmark_data.completed_time,
+    )
+
+    if benchmark_data.success:
+        # Add additional columns for success case
+        additional_columns = (
+            benchmark_data.query_latency,
+            benchmark_data.first_chunk_latency,
+            benchmark_data.n_chunks,
+            benchmark_data.n_chunks_time,
+            benchmark_data.prompt_tokens,
+            benchmark_data.completion_tokens,
+            benchmark_data.max_gpu_memory_cost,
+        )
+        query = """INSERT INTO result(
+                       request, start_time, chunk_times, success, response_messages,
+                       completed_time, latency, first_chunk_latency,
+                       n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
+                   ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        cursor.execute(query, common_columns + additional_columns)
+    else:
+        query = """INSERT INTO result(
+                       request, start_time, chunk_times, success, response_messages, completed_time
+                   ) VALUES (?, ?, ?, ?, ?, ?)"""
+        cursor.execute(query, common_columns)
+
+
+def get_result_db_path(name, model):
+    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_dir = './outputs'
+    result_db_path = os.path.join(output_dir, f'{name or model}_perf', current_time, 'benchmark_data.db')
+
+    if not os.path.exists(os.path.dirname(result_db_path)):
+        os.makedirs(os.path.dirname(result_db_path), exist_ok=True)
+
+    logger.info(f'Save the result to: {result_db_path}')
+    if os.path.exists(result_db_path):
+        logger.warning('The db file exists, delete it and start again!.')
+        sys.exit(1)
+
+    return result_db_path
+
+
+def get_percentile_results(result_db_path: str):
+
+    def percentile_results(rows, index, percentiles):
+        results = {}
+        n_success_queries = len(rows)
+        for percentile in percentiles:
+            idx = int(n_success_queries * percentile / 100)
+            row = rows[idx]
+            value = row[index] if row[index] is not None else float('inf')
+            results[percentile] = round(value, 4)
+        return results
+
+    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
+                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
+                 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+    percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
+
+    with sqlite3.connect(result_db_path) as con:
+        rows = con.execute(query_sql).fetchall()
+
+    if len(rows) <= len(percentiles):
+        logger.info('Too little data to calculate quantiles!')
+        return {}
+
+    # Calculate percentiles for first chunk latency and latency
+    first_chunk_latency_index = 5
+    latency_index = 4
+
+    first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
+    rows.sort(key=lambda x: x[latency_index])
+    latency_results = percentile_results(rows, latency_index, percentiles)
+
+    # Prepare data for tabulation
+    return {
+        'Percentile': [f'{p}%' for p in percentiles],
+        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
+        'Latency (s)': [latency_results[p] for p in percentiles]
+    }
+
+
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+    result_path = os.path.dirname(result_db_path)
+    write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
+
+    data = metrics.create_message()
+    data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
+    write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+
+    # Print summary in a table
+    table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+    logger.info('\nBenchmarking summary:\n' + table)
+
+    # Get percentile results
+    percentile_result = get_percentile_results(result_db_path)
+    if percentile_result:
+        write_json_file(transpose_results(percentile_result), os.path.join(result_path, 'benchmark_percentile.json'))
+        # Print percentile results in a table
+        table = tabulate(percentile_result, headers='keys', tablefmt='pretty')
+        logger.info('\nPercentile results:\n' + table)
+
+    if args.dataset.startswith('speed_benchmark'):
+        speed_benchmark_result(result_db_path)
+
+
+def speed_benchmark_result(result_db_path: str):
+    query_sql = """
+        SELECT
+            prompt_tokens,
+            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
+            ROUND(AVG(max_gpu_memory_cost), 2)
+        FROM
+            result
+        WHERE
+            success = 1 AND latency > 0
+        GROUP BY
+            prompt_tokens
+    """
+
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        rows = cursor.fetchall()
+
+    # Prepare data for tabulation
+    headers = ['Prompt Tokens', 'Speed(tokens/s)', 'GPU Memory(GB)']
+    data = [dict(zip(headers, row)) for row in rows]
+
+    # Print results in a table
+    table = tabulate(data, headers='keys', tablefmt='pretty')
+    logger.info('\nSpeed Benchmark Results:\n' + table)
+
+    # Write results to JSON file
+    result_path = os.path.dirname(result_db_path)
+    write_json_file(data, os.path.join(result_path, 'speed_benchmark.json'))
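
The helpers above form the new per-run result store: get_result_db_path() picks an ./outputs/<name>_perf/<timestamp>/benchmark_data.db file, create_result_table() creates the result table, insert_benchmark_data() writes one row per request, and get_percentile_results()/summary_result() read the rows back for reporting. A minimal usage sketch follows; it is illustrative only, and the BenchmarkData keyword arguments are assumptions (the real fields live in the new benchmark_util.py, which is not shown in this hunk).

import sqlite3

from evalscope.perf.utils.benchmark_util import BenchmarkData
from evalscope.perf.utils.db_util import (create_result_table, get_percentile_results, get_result_db_path,
                                          insert_benchmark_data)

# Resolve ./outputs/demo_perf/<timestamp>/benchmark_data.db and create its directory.
db_path = get_result_db_path(name='demo', model='qwen2.5-0.5b-instruct')

with sqlite3.connect(db_path) as con:
    cursor = con.cursor()
    create_result_table(cursor)

    # Hypothetical record: the field names mirror the columns above, but the real constructor may differ.
    record = BenchmarkData(request={'prompt': 'hello'}, start_time=0.0, completed_time=1.2, success=False)
    insert_benchmark_data(cursor, record)  # failed requests only store the common columns
    con.commit()

# Returns {} until there are more successful rows than the 10 percentile buckets.
print(get_percentile_results(db_path))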

evalscope/perf/utils/handler.py (new file)
@@ -0,0 +1,46 @@
+import asyncio
+import functools
+import signal
+import sys
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def exception_handler(func):
+    if asyncio.iscoroutinefunction(func):
+
+        @functools.wraps(func)
+        async def async_wrapper(*args, **kwargs):
+            try:
+                return await func(*args, **kwargs)
+            except Exception as e:
+                logger.exception(f"Exception in async function '{func.__name__}': {e}")
+                sys.exit(1)
+
+        return async_wrapper
+    else:
+
+        @functools.wraps(func)
+        def sync_wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except Exception as e:
+                logger.exception(f"Exception in function '{func.__name__}': {e}")
+                sys.exit(1)
+
+        return sync_wrapper
+
+
+def signal_handler(signal_name, loop):
+    logger.info('Got signal %s: exit' % signal_name)
+    loop.stop()
+
+
+def add_signal_handlers(loop):
+    for signal_name in {'SIGINT', 'SIGTERM'}:
+        loop.add_signal_handler(
+            getattr(signal, signal_name),
+            functools.partial(signal_handler, signal_name, loop),
+        )
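
handler.py centralizes error and signal handling for the perf benchmark: exception_handler wraps sync or async callables so that any uncaught exception is logged with a traceback before the process exits with status 1, and add_signal_handlers() stops a running event loop on SIGINT/SIGTERM. A minimal sketch of wiring the two together; the run() coroutine is a placeholder, and loop.add_signal_handler is only available on POSIX event loops.

import asyncio

from evalscope.perf.utils.handler import add_signal_handlers, exception_handler


@exception_handler  # logs the traceback via logger.exception and calls sys.exit(1)
async def run():
    await asyncio.sleep(0.1)  # placeholder workload


loop = asyncio.new_event_loop()
add_signal_handlers(loop)  # stop the loop on SIGINT / SIGTERM (POSIX only)
loop.run_until_complete(run())
loop.close()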

evalscope/perf/utils/local_server.py (new file)
@@ -0,0 +1,139 @@
+import os
+import subprocess
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+
+import torch
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from sse_starlette.sse import EventSourceResponse
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class ServerSentEvent(object):
+
+    def __init__(self, data='', event=None, id=None, retry=None):
+        self.data = data
+        self.event = event
+        self.id = id
+        self.retry = retry
+
+    @classmethod
+    def decode(cls, line):
+        """Decode line to ServerSentEvent
+
+
+        Args:
+            line (str): The line.
+
+        Return:
+            ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
+
+        """
+        if not line:
+            return None
+        sse_msg = cls()
+        # format data:xxx
+        field_type, _, field_value = line.partition(':')
+        if field_value.startswith(' '):  # compatible with openai api
+            field_value = field_value[1:]
+        if field_type == 'event':
+            sse_msg.event = field_value
+        elif field_type == 'data':
+            field_value = field_value.rstrip()
+            sse_msg.data = field_value
+        elif field_type == 'id':
+            sse_msg.id = field_value
+        elif field_type == 'retry':
+            sse_msg.retry = field_value
+        else:
+            pass
+
+        return sse_msg
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    yield
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def create_app(args) -> FastAPI:
+    app = FastAPI(lifespan=lifespan)
+    chat_service = ChatService(model_path=args.model, attn_implementation=args.attn_implementation)
+
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=['*'],
+        allow_credentials=True,
+        allow_methods=['*'],
+        allow_headers=['*'],
+    )
+
+    @app.get('/v1/models', response_model=ModelList)
+    async def list_models():
+        return await chat_service.list_models()
+
+    @app.post('/v1/completions')
+    async def create_text_completion(request: TextCompletionRequest):
+        return await chat_service._text_completion(request)
+
+    @app.post('/v1/chat/completions')
+    async def create_chat_completion(request: ChatCompletionRequest):
+        if request.stream:
+            return EventSourceResponse(chat_service._stream_chat(request))
+        else:
+            return await chat_service._chat(request)
+
+    return app
+
+
+def start_app(args: Arguments):
+    if args.api == 'local':
+        app = create_app(args)
+        uvicorn.run(app, host='0.0.0.0', port=8877, workers=1)
+
+    elif args.api == 'local_vllm':
+        os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+
+        proc = subprocess.Popen([
+            'python', '-m', 'vllm.entrypoints.openai.api_server', '--model', args.model, '--served-model-name',
+            os.path.basename(args.model), '--tensor-parallel-size',
+            str(torch.cuda.device_count()), '--max-model-len', '32768', '--gpu-memory-utilization', '0.9', '--host',
+            '0.0.0.0', '--port', '8877', '--disable-log-requests', '--disable-log-stats'
+        ])
+        import atexit
+
+        def on_exit():
+            if proc.poll() is None:
+                logger.info('Terminating the child process...')
+                proc.terminate()
+                try:
+                    proc.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    logger.warning('Child process did not terminate within the timeout, killing it forcefully...')
+                    proc.kill()
+                    proc.wait()
+                logger.info('Child process terminated.')
+            else:
+                logger.info('Child process has already terminated.')
+
+        atexit.register(on_exit)
+    else:
+        raise ValueError(f'Unknown API type: {args.api}')
+
+
+if __name__ == '__main__':
+    from collections import namedtuple
+
+    args = namedtuple('Args', ['model', 'attn_implementation', 'api'])
+
+    start_app(args(model='Qwen/Qwen2.5-0.5B-Instruct', attn_implementation=None, api='local_vllm'))
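
Both branches of start_app() expose an OpenAI-compatible API on 0.0.0.0:8877: 'local' serves the in-process FastAPI app backed by ChatService, while 'local_vllm' launches a vLLM api_server subprocess. Once the server is up, any HTTP client can exercise it; below is a small illustrative sketch using requests, where the model name assumes the Qwen2.5-0.5B-Instruct default from the __main__ block above (the served model name is the basename of args.model).

import requests

BASE = 'http://127.0.0.1:8877/v1'

# List the models the server registered.
print(requests.get(f'{BASE}/models', timeout=10).json())

# Non-streaming chat completion against the local endpoint.
resp = requests.post(
    f'{BASE}/chat/completions',
    json={
        'model': 'Qwen2.5-0.5B-Instruct',  # assumed served model name
        'messages': [{'role': 'user', 'content': 'Hello!'}],
        'stream': False,
    },
    timeout=60,
)
print(resp.json())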

evalscope/registry/config/cfg_arena.yaml (new file)
@@ -0,0 +1,77 @@
+# input raw data
+question_file: registry/data/question.jsonl
+
+# candidate models to be battled
+answers_gen:
+  chatglm3-6b:
+    # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+    model_id_or_path: ZhipuAI/chatglm3-6b  # model_id on modelscope
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model
+    template_type: chatglm3  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.333
+    # output predicted answer file name
+    output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+  Baichuan2-7B-Base:
+    model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: false  # enable or disable this model
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+  Qwen-7B:
+    model_id_or_path: qwen/Qwen-7B
+    revision: v1.1.8  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model  # TODO: tokenizer issue
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+# Auto-reviewer(GPT-4) config
+reviews_gen:
+  enable: true
+  reviewer:
+    # class reference of auto reviewer(GPT-4)
+    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+    args:
+      max_tokens: 1024
+      temperature: 0.2
+  # options: pairwise, pairwise_baseline, single (default is pairwise)
+  mode: pairwise
+  # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
+  position_bias_mitigation: NULL
+  # completion parser config, default is lmsys_parser
+  fn_completion_parser: lmsys_parser
+  # prompt templates for auto reviewer(GPT-4)
+  prompt_file: registry/data/prompt_template/prompt_templates.jsonl
+  # target answer files list to be reviewed,
+  # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
+  # Default is NULL, which means all answers in answers_gen will be reviewed
+  target_answers: NULL
+  # output file name of auto reviewer
+  review_file: registry/data/arena/reviews/review_gpt4.jsonl
+
+# rating results
+rating_gen:
+  enable: true
+  metrics: ['elo']
+  # elo rating report file name
+  report_file: registry/data/arena/reports/elo_rating_origin.csv
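
In cfg_arena.yaml only the candidates with enable: true take part in the battle; each one writes its answers to its output_file, the GPT-4 auto-reviewer compares them pairwise, and rating_gen computes an Elo report. As a quick, illustrative check of which entries are active (plain PyYAML, not the evalscope entry point; the path is relative to the installed package root):

import yaml

with open('evalscope/registry/config/cfg_arena.yaml') as f:
    cfg = yaml.safe_load(f)

# Candidates that will actually be battled (enable: true).
enabled = [name for name, spec in cfg['answers_gen'].items() if spec.get('enable')]
print('Models to battle:', enabled)                     # ['chatglm3-6b', 'Qwen-7B']
print('Rating metrics:', cfg['rating_gen']['metrics'])  # ['elo']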

evalscope/registry/config/cfg_arena_zhihu.yaml (new file)
@@ -0,0 +1,63 @@
+# input raw data
+question_file: registry/data/question.jsonl
+
+# candidate models to be battled
+answers_gen:
+  Qwen2-7B-Instruct:
+    model_id_or_path: /mnt/data/data/user/maoyunlin.myl/models/Qwen2-7B-Instruct  # model_id on modelscope
+    revision: NULL  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model
+    template_type: default-generation  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+    generation_config:
+      do_sample: true
+      max_new_tokens: 512
+      top_k: 20
+      top_p: 0.9
+      temperature: 0.7
+    # output predicted answer file name
+    output_file: registry/data/arena/answers/answer_qwen2.jsonl
+  Qwen-7B:
+    model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
+    revision: NULL  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 512
+      top_k: 20
+      top_p: 0.9
+      temperature: 0.7
+    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+# Auto-reviewer(GPT-4) config
+reviews_gen:
+  enable: true
+  reviewer:
+    # class reference of auto reviewer(GPT-4)
+    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+    args:
+      max_tokens: 1024
+      temperature: 0.2
+  # options: pairwise, pairwise_baseline, single (default is pairwise)
+  mode: pairwise
+  # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
+  position_bias_mitigation: NULL
+  # completion parser config, default is lmsys_parser
+  fn_completion_parser: lmsys_parser
+  # prompt templates for auto reviewer(GPT-4)
+  prompt_file: registry/data/prompt_template/prompt_templates.jsonl
+  # target answer files list to be reviewed,
+  # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
+  # Default is NULL, which means all answers in answers_gen will be reviewed
+  target_answers: NULL
+  # output file name of auto reviewer
+  review_file: registry/data/arena/reviews/review_gpt4.jsonl
+
+# rating results
+rating_gen:
+  enable: true
+  metrics: ['elo']
+  # elo rating report file name
+  report_file: registry/data/arena/reports/elo_rating_origin.csv

evalscope/registry/config/cfg_pairwise_baseline.yaml (new file)
@@ -0,0 +1,83 @@
+# input raw data
+question_file: registry/data/question.jsonl
+
+# candidate models to be battled
+answers_gen:
+  chatglm3-6b:
+    # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+    model_id_or_path: ZhipuAI/chatglm3-6b  # model_id on modelscope
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model
+    template_type: chatglm3  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    # output predicted answer file name
+    output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+  Baichuan2-7B-Base:
+    model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: false  # enable or disable this model
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+  Qwen-7B:
+    model_id_or_path: qwen/Qwen-7B
+    revision: v1.1.8  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model  # TODO: tokenizer issue
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+# model of auto-reviewer
+reviews_gen:
+  enable: true
+  reviewer:
+    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+    args:
+      model: gpt-4
+      max_tokens: 1024
+      temperature: 0
+  # pairwise comparison against baseline
+  mode: pairwise_baseline
+  # position bias mitigation strategy, options: swap_position, randomize_order, None. default is None
+  position_bias_mitigation: swap_position
+  # completion parser config, default is lmsys_parser
+  fn_completion_parser: lmsys_parser
+  # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
+  target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
+                   registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
+  # the path to the outputs of the baseline model
+  baseline_file: registry/data/arena/answers/answer_text_davinci_003.jsonl
+  # the path to the reference answers
+  reference_file:
+  # prompt templates for auto reviewer(GPT-4)
+  prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
+  # output file of auto reviewer
+  review_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
+  # cache file of auto reviewer
+  cache_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
+
+# rating results
+rating_gen:
+  enable: true
+  metrics: ['pairwise']
+  baseline_model: text_davinci_003
+  # elo rating report file
+  report_file: registry/data/arena/reports/rating_pairwise_baseline.csv

evalscope/registry/config/cfg_single.yaml (new file)
@@ -0,0 +1,78 @@
+# input raw data
+question_file: registry/data/question.jsonl
+
+# candidate models to be battled
+answers_gen:
+  chatglm3-6b:
+    # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+    model_id_or_path: ZhipuAI/chatglm3-6b  # model_id on modelscope
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model
+    template_type: chatglm3
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    # output predicted answer file name
+    output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+  Baichuan2-7B-Base:
+    model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+    revision: v1.0.2  # revision of model, default is NULL
+    precision: torch.float16
+    enable: false  # enable or disable this model
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+  Qwen-7B:
+    model_id_or_path: qwen/Qwen-7B
+    revision: v1.1.8  # revision of model, default is NULL
+    precision: torch.float16
+    enable: true  # enable or disable this model  # TODO: tokenizer issue
+    template_type: default-generation
+    generation_config:
+      do_sample: true
+      max_new_tokens: 256
+      top_k: 20
+      top_p: 0.75
+      temperature: 0.3
+    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+# model of auto-reviewer
+reviews_gen:
+  enable: true
+  reviewer:
+    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+    args:
+      model: gpt-4
+      max_tokens: 1024
+      temperature: 0
+  # pairwise comparison against baseline
+  mode: single
+  # completion parser config, default is lmsys_parser
+  fn_completion_parser: lmsys_parser
+  # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
+  target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
+                   registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
+  # the path to the reference answers
+  reference_file:
+  # prompt templates for auto reviewer(GPT-4)
+  prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
+  # output file of auto reviewer
+  review_file: registry/data/arena/reviews/review_gpt4_single.jsonl
+  # cache file of auto reviewer
+  cache_file: registry/data/arena/reviews/review_gpt4_single.jsonl
+
+# rating results
+rating_gen:
+  enable: true
+  metrics: ['score']
+  # elo rating report file
+  report_file: registry/data/arena/reports/rating_single.csv