evalscope 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (108)
  1. evalscope/backend/opencompass/tasks/eval_api.py +2 -1
  2. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  4. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  23. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  24. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  25. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  26. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  27. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  28. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  29. evalscope/benchmarks/race/samples.jsonl +5 -0
  30. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  31. evalscope/cli/start_perf.py +8 -11
  32. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  33. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  34. evalscope/metrics/rouge_metric.py +30 -15
  35. evalscope/perf/arguments.py +179 -0
  36. evalscope/perf/benchmark.py +245 -0
  37. evalscope/perf/http_client.py +127 -711
  38. evalscope/perf/main.py +35 -0
  39. evalscope/perf/plugin/__init__.py +2 -0
  40. evalscope/perf/plugin/api/__init__.py +3 -0
  41. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  42. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  43. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  44. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  45. evalscope/perf/plugin/datasets/__init__.py +6 -0
  46. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  47. evalscope/perf/plugin/datasets/custom.py +21 -0
  48. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  49. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  50. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  51. evalscope/perf/plugin/datasets/openqa.py +38 -0
  52. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  53. evalscope/perf/plugin/registry.py +54 -0
  54. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  55. evalscope/perf/utils/benchmark_util.py +135 -0
  56. evalscope/perf/utils/chat_service.py +252 -0
  57. evalscope/perf/utils/db_util.py +200 -0
  58. evalscope/perf/utils/handler.py +46 -0
  59. evalscope/perf/utils/local_server.py +139 -0
  60. evalscope/registry/config/cfg_arena.yaml +77 -0
  61. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  62. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  63. evalscope/registry/config/cfg_single.yaml +78 -0
  64. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  65. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  66. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  67. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  68. evalscope/registry/data/question.jsonl +80 -0
  69. evalscope/third_party/longbench_write/README.md +118 -0
  70. evalscope/third_party/longbench_write/default_task.json +27 -0
  71. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  72. evalscope/third_party/toolbench_static/README.md +118 -0
  73. evalscope/third_party/toolbench_static/config_default.json +15 -0
  74. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  75. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  76. evalscope/utils/logger.py +18 -20
  77. evalscope/utils/utils.py +41 -42
  78. evalscope/version.py +2 -2
  79. evalscope-0.7.1.dist-info/LICENSE +203 -0
  80. {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/METADATA +93 -35
  81. {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/RECORD +101 -31
  82. {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/WHEEL +1 -1
  83. {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/top_level.txt +1 -0
  84. tests/cli/__init__.py +1 -0
  85. tests/cli/test_run.py +76 -0
  86. tests/perf/__init__.py +1 -0
  87. tests/perf/test_perf.py +96 -0
  88. tests/rag/test_clip_benchmark.py +85 -0
  89. tests/rag/test_mteb.py +136 -0
  90. tests/rag/test_ragas.py +120 -0
  91. tests/swift/__init__.py +1 -0
  92. tests/swift/test_run_swift_eval.py +146 -0
  93. tests/swift/test_run_swift_vlm_eval.py +128 -0
  94. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  95. tests/test_run_all.py +12 -0
  96. tests/vlm/__init__.py +1 -0
  97. tests/vlm/test_vlmeval.py +59 -0
  98. evalscope/perf/_logging.py +0 -32
  99. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  100. evalscope/perf/datasets/openqa.py +0 -22
  101. evalscope/perf/plugin_registry.py +0 -35
  102. evalscope/perf/query_parameters.py +0 -42
  103. evalscope/perf/server_sent_event.py +0 -43
  104. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  105. /evalscope/perf/{datasets → utils}/__init__.py +0 -0
  106. {evalscope-0.6.1.dist-info → evalscope-0.7.1.dist-info}/entry_points.txt +0 -0
  107. {evalscope/preprocess → tests}/__init__.py +0 -0
  108. {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
evalscope/perf/utils/db_util.py
@@ -0,0 +1,200 @@
+ import base64
+ import os
+ import pickle
+ import sqlite3
+ import sys
+ from datetime import datetime
+
+ import json
+ from tabulate import tabulate
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def encode_data(data) -> str:
+     """Encodes data using base64 and pickle."""
+     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
+
+
+ def write_json_file(data, output_path):
+     with open(output_path, 'w') as f:
+         json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+ def transpose_results(data):
+     headers = data.keys()
+     rows = zip(*data.values())
+
+     return [dict(zip(headers, row)) for row in rows]
+
+
+ def create_result_table(cursor):
+     cursor.execute('''CREATE TABLE IF NOT EXISTS result(
+         request TEXT,
+         start_time REAL,
+         chunk_times TEXT,
+         success INTEGER,
+         response_messages TEXT,
+         completed_time REAL,
+         latency REAL,
+         first_chunk_latency REAL,
+         n_chunks INTEGER,
+         chunk_time REAL,
+         prompt_tokens INTEGER,
+         completion_tokens INTEGER,
+         max_gpu_memory_cost REAL)''')
+
+
+ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
+     request = encode_data(benchmark_data.request)
+     chunk_times = json.dumps(benchmark_data.chunk_times)
+     response_messages = encode_data(benchmark_data.response_messages)
+
+     # Columns common to both success and failure cases
+     common_columns = (
+         request,
+         benchmark_data.start_time,
+         chunk_times,
+         benchmark_data.success,
+         response_messages,
+         benchmark_data.completed_time,
+     )
+
+     if benchmark_data.success:
+         # Add additional columns for success case
+         additional_columns = (
+             benchmark_data.query_latency,
+             benchmark_data.first_chunk_latency,
+             benchmark_data.n_chunks,
+             benchmark_data.n_chunks_time,
+             benchmark_data.prompt_tokens,
+             benchmark_data.completion_tokens,
+             benchmark_data.max_gpu_memory_cost,
+         )
+         query = """INSERT INTO result(
+             request, start_time, chunk_times, success, response_messages,
+             completed_time, latency, first_chunk_latency,
+             n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
+         ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+         cursor.execute(query, common_columns + additional_columns)
+     else:
+         query = """INSERT INTO result(
+             request, start_time, chunk_times, success, response_messages, completed_time
+         ) VALUES (?, ?, ?, ?, ?, ?)"""
+         cursor.execute(query, common_columns)
+
+
+ def get_result_db_path(name, model):
+     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+     output_dir = './outputs'
+     result_db_path = os.path.join(output_dir, f'{name or model}_perf', current_time, 'benchmark_data.db')
+
+     if not os.path.exists(os.path.dirname(result_db_path)):
+         os.makedirs(os.path.dirname(result_db_path), exist_ok=True)
+
+     logger.info(f'Save the result to: {result_db_path}')
+     if os.path.exists(result_db_path):
+         logger.warning('The db file exists, delete it and start again!.')
+         sys.exit(1)
+
+     return result_db_path
+
+
+ def get_percentile_results(result_db_path: str):
+
+     def percentile_results(rows, index, percentiles):
+         results = {}
+         n_success_queries = len(rows)
+         for percentile in percentiles:
+             idx = int(n_success_queries * percentile / 100)
+             row = rows[idx]
+             value = row[index] if row[index] is not None else float('inf')
+             results[percentile] = round(value, 4)
+         return results
+
+     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
+                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
+                  'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
+
+     with sqlite3.connect(result_db_path) as con:
+         rows = con.execute(query_sql).fetchall()
+
+     if len(rows) <= len(percentiles):
+         logger.info('Too little data to calculate quantiles!')
+         return {}
+
+     # Calculate percentiles for first chunk latency and latency
+     first_chunk_latency_index = 5
+     latency_index = 4
+
+     first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
+     rows.sort(key=lambda x: x[latency_index])
+     latency_results = percentile_results(rows, latency_index, percentiles)
+
+     # Prepare data for tabulation
+     return {
+         'Percentile': [f'{p}%' for p in percentiles],
+         'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
+         'Latency (s)': [latency_results[p] for p in percentiles]
+     }
+
+
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+     result_path = os.path.dirname(result_db_path)
+     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
+
+     data = metrics.create_message()
+     data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
+     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+
+     # Print summary in a table
+     table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+     logger.info('\nBenchmarking summary:\n' + table)
+
+     # Get percentile results
+     percentile_result = get_percentile_results(result_db_path)
+     if percentile_result:
+         write_json_file(transpose_results(percentile_result), os.path.join(result_path, 'benchmark_percentile.json'))
+         # Print percentile results in a table
+         table = tabulate(percentile_result, headers='keys', tablefmt='pretty')
+         logger.info('\nPercentile results:\n' + table)
+
+     if args.dataset.startswith('speed_benchmark'):
+         speed_benchmark_result(result_db_path)
+
+
+ def speed_benchmark_result(result_db_path: str):
+     query_sql = """
+     SELECT
+         prompt_tokens,
+         ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
+         ROUND(AVG(max_gpu_memory_cost), 2)
+     FROM
+         result
+     WHERE
+         success = 1 AND latency > 0
+     GROUP BY
+         prompt_tokens
+     """
+
+     with sqlite3.connect(result_db_path) as con:
+         cursor = con.cursor()
+         cursor.execute(query_sql)
+         rows = cursor.fetchall()
+
+     # Prepare data for tabulation
+     headers = ['Prompt Tokens', 'Speed(tokens/s)', 'GPU Memory(GB)']
+     data = [dict(zip(headers, row)) for row in rows]
+
+     # Print results in a table
+     table = tabulate(data, headers='keys', tablefmt='pretty')
+     logger.info('\nSpeed Benchmark Results:\n' + table)
+
+     # Write results to JSON file
+     result_path = os.path.dirname(result_db_path)
+     write_json_file(data, os.path.join(result_path, 'speed_benchmark.json'))
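Not part of the diff: a minimal sketch of how the db_util helpers above might be wired together, assuming the module is importable as evalscope.perf.utils.db_util (path taken from the file list) and that the benchmark loop fills the table via insert_benchmark_data().

import sqlite3

from evalscope.perf.utils.db_util import create_result_table, get_percentile_results, get_result_db_path

# Builds ./outputs/demo_perf/<timestamp>/benchmark_data.db (directory is created, path is returned).
db_path = get_result_db_path(name='demo', model='qwen2-7b-instruct')

# Create the result table that insert_benchmark_data() writes into.
with sqlite3.connect(db_path) as con:
    create_result_table(con.cursor())
    con.commit()

# ... the benchmark would call insert_benchmark_data(cursor, benchmark_data) once per request ...

# Returns {} until more than 10 successful rows are stored (one per percentile bucket).
print(get_percentile_results(db_path))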
evalscope/perf/utils/handler.py
@@ -0,0 +1,46 @@
+ import asyncio
+ import functools
+ import signal
+ import sys
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def exception_handler(func):
+     if asyncio.iscoroutinefunction(func):
+
+         @functools.wraps(func)
+         async def async_wrapper(*args, **kwargs):
+             try:
+                 return await func(*args, **kwargs)
+             except Exception as e:
+                 logger.exception(f"Exception in async function '{func.__name__}': {e}")
+                 sys.exit(1)
+
+         return async_wrapper
+     else:
+
+         @functools.wraps(func)
+         def sync_wrapper(*args, **kwargs):
+             try:
+                 return func(*args, **kwargs)
+             except Exception as e:
+                 logger.exception(f"Exception in function '{func.__name__}': {e}")
+                 sys.exit(1)
+
+         return sync_wrapper
+
+
+ def signal_handler(signal_name, loop):
+     logger.info('Got signal %s: exit' % signal_name)
+     loop.stop()
+
+
+ def add_signal_handlers(loop):
+     for signal_name in {'SIGINT', 'SIGTERM'}:
+         loop.add_signal_handler(
+             getattr(signal, signal_name),
+             functools.partial(signal_handler, signal_name, loop),
+         )
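Not part of the diff: an assumed usage sketch of the handler utilities above (module path inferred from the file list). exception_handler picks the async or sync wrapper automatically, and add_signal_handlers relies on loop.add_signal_handler, which is only available on Unix event loops.

import asyncio

from evalscope.perf.utils.handler import add_signal_handlers, exception_handler


@exception_handler  # async function -> async_wrapper; exceptions are logged, then sys.exit(1)
async def run_once():
    await asyncio.sleep(0.1)


loop = asyncio.new_event_loop()
add_signal_handlers(loop)  # SIGINT/SIGTERM call loop.stop() instead of raising KeyboardInterrupt
loop.run_until_complete(run_once())
loop.close()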
evalscope/perf/utils/local_server.py
@@ -0,0 +1,139 @@
+ import os
+ import subprocess
+ from contextlib import asynccontextmanager
+ from dataclasses import dataclass
+
+ import torch
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from sse_starlette.sse import EventSourceResponse
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @dataclass
+ class ServerSentEvent(object):
+
+     def __init__(self, data='', event=None, id=None, retry=None):
+         self.data = data
+         self.event = event
+         self.id = id
+         self.retry = retry
+
+     @classmethod
+     def decode(cls, line):
+         """Decode line to ServerSentEvent
+
+
+         Args:
+             line (str): The line.
+
+         Return:
+             ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
+
+         """
+         if not line:
+             return None
+         sse_msg = cls()
+         # format data:xxx
+         field_type, _, field_value = line.partition(':')
+         if field_value.startswith(' '):  # compatible with openai api
+             field_value = field_value[1:]
+         if field_type == 'event':
+             sse_msg.event = field_value
+         elif field_type == 'data':
+             field_value = field_value.rstrip()
+             sse_msg.data = field_value
+         elif field_type == 'id':
+             sse_msg.id = field_value
+         elif field_type == 'retry':
+             sse_msg.retry = field_value
+         else:
+             pass
+
+         return sse_msg
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     yield
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+
+ def create_app(args) -> FastAPI:
+     app = FastAPI(lifespan=lifespan)
+     chat_service = ChatService(model_path=args.model, attn_implementation=args.attn_implementation)
+
+     app.add_middleware(
+         CORSMiddleware,
+         allow_origins=['*'],
+         allow_credentials=True,
+         allow_methods=['*'],
+         allow_headers=['*'],
+     )
+
+     @app.get('/v1/models', response_model=ModelList)
+     async def list_models():
+         return await chat_service.list_models()
+
+     @app.post('/v1/completions')
+     async def create_text_completion(request: TextCompletionRequest):
+         return await chat_service._text_completion(request)
+
+     @app.post('/v1/chat/completions')
+     async def create_chat_completion(request: ChatCompletionRequest):
+         if request.stream:
+             return EventSourceResponse(chat_service._stream_chat(request))
+         else:
+             return await chat_service._chat(request)
+
+     return app
+
+
+ def start_app(args: Arguments):
+     if args.api == 'local':
+         app = create_app(args)
+         uvicorn.run(app, host='0.0.0.0', port=8877, workers=1)
+
+     elif args.api == 'local_vllm':
+         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+
+         proc = subprocess.Popen([
+             'python', '-m', 'vllm.entrypoints.openai.api_server', '--model', args.model, '--served-model-name',
+             os.path.basename(args.model), '--tensor-parallel-size',
+             str(torch.cuda.device_count()), '--max-model-len', '32768', '--gpu-memory-utilization', '0.9', '--host',
+             '0.0.0.0', '--port', '8877', '--disable-log-requests', '--disable-log-stats'
+         ])
+         import atexit
+
+         def on_exit():
+             if proc.poll() is None:
+                 logger.info('Terminating the child process...')
+                 proc.terminate()
+                 try:
+                     proc.wait(timeout=10)
+                 except subprocess.TimeoutExpired:
+                     logger.warning('Child process did not terminate within the timeout, killing it forcefully...')
+                     proc.kill()
+                     proc.wait()
+                 logger.info('Child process terminated.')
+             else:
+                 logger.info('Child process has already terminated.')
+
+         atexit.register(on_exit)
+     else:
+         raise ValueError(f'Unknown API type: {args.api}')
+
+
+ if __name__ == '__main__':
+     from collections import namedtuple
+
+     args = namedtuple('Args', ['model', 'attn_implementation', 'api'])
+
+     start_app(args(model='Qwen/Qwen2.5-0.5B-Instruct', attn_implementation=None, api='local_vllm'))
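Not part of the diff: start_app above serves OpenAI-compatible /v1 endpoints on 0.0.0.0:8877 for both the 'local' and 'local_vllm' backends, and ServerSentEvent.decode parses a single "field: value" line of an SSE stream. A quick illustration of the decoder (assumed usage, not from the package docs):

from evalscope.perf.utils.local_server import ServerSentEvent

# 'data' lines carry the payload; a single leading space after ':' is stripped for OpenAI compatibility.
msg = ServerSentEvent.decode('data: {"choices": [{"delta": {"content": "hello"}}]}')
print(msg.data)                    # -> {"choices": [{"delta": {"content": "hello"}}]}
print(ServerSentEvent.decode(''))  # -> None (blank keep-alive lines are skipped)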
evalscope/registry/config/cfg_arena.yaml
@@ -0,0 +1,77 @@
+ # input raw data
+ question_file: registry/data/question.jsonl
+
+ # candidate models to be battled
+ answers_gen:
+   chatglm3-6b:
+     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model
+     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.333
+     # output predicted answer file name
+     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+   Baichuan2-7B-Base:
+     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: false # enable or disable this model
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+   Qwen-7B:
+     model_id_or_path: qwen/Qwen-7B
+     revision: v1.1.8 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model # TODO: tokenizer issue
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+ # Auto-reviewer(GPT-4) config
+ reviews_gen:
+   enable: true
+   reviewer:
+     # class reference of auto reviewer(GPT-4)
+     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+     args:
+       max_tokens: 1024
+       temperature: 0.2
+       # options: pairwise, pairwise_baseline, single (default is pairwise)
+       mode: pairwise
+       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
+       position_bias_mitigation: NULL
+       # completion parser config, default is lmsys_parser
+       fn_completion_parser: lmsys_parser
+   # prompt templates for auto reviewer(GPT-4)
+   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
+   # target answer files list to be reviewed,
+   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
+   # Default is NULL, which means all answers in answers_gen will be reviewed
+   target_answers: NULL
+   # output file name of auto reviewer
+   review_file: registry/data/arena/reviews/review_gpt4.jsonl
+
+ # rating results
+ rating_gen:
+   enable: true
+   metrics: ['elo']
+   # elo rating report file name
+   report_file: registry/data/arena/reports/elo_rating_origin.csv
evalscope/registry/config/cfg_arena_zhihu.yaml
@@ -0,0 +1,63 @@
+ # input raw data
+ question_file: registry/data/question.jsonl
+
+ # candidate models to be battled
+ answers_gen:
+   Qwen2-7B-Instruct:
+     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/models/Qwen2-7B-Instruct # model_id on modelscope
+     revision: NULL # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model
+     template_type: default-generation # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+     generation_config:
+       do_sample: true
+       max_new_tokens: 512
+       top_k: 20
+       top_p: 0.9
+       temperature: 0.7
+     # output predicted answer file name
+     output_file: registry/data/arena/answers/answer_qwen2.jsonl
+   Qwen-7B:
+     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
+     revision: NULL # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 512
+       top_k: 20
+       top_p: 0.9
+       temperature: 0.7
+     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+ # Auto-reviewer(GPT-4) config
+ reviews_gen:
+   enable: true
+   reviewer:
+     # class reference of auto reviewer(GPT-4)
+     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+     args:
+       max_tokens: 1024
+       temperature: 0.2
+       # options: pairwise, pairwise_baseline, single (default is pairwise)
+       mode: pairwise
+       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
+       position_bias_mitigation: NULL
+       # completion parser config, default is lmsys_parser
+       fn_completion_parser: lmsys_parser
+   # prompt templates for auto reviewer(GPT-4)
+   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
+   # target answer files list to be reviewed,
+   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
+   # Default is NULL, which means all answers in answers_gen will be reviewed
+   target_answers: NULL
+   # output file name of auto reviewer
+   review_file: registry/data/arena/reviews/review_gpt4.jsonl
+
+ # rating results
+ rating_gen:
+   enable: true
+   metrics: ['elo']
+   # elo rating report file name
+   report_file: registry/data/arena/reports/elo_rating_origin.csv
evalscope/registry/config/cfg_pairwise_baseline.yaml
@@ -0,0 +1,83 @@
+ # input raw data
+ question_file: registry/data/question.jsonl
+
+ # candidate models to be battled
+ answers_gen:
+   chatglm3-6b:
+     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model
+     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     # output predicted answer file name
+     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+   Baichuan2-7B-Base:
+     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: false # enable or disable this model
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+   Qwen-7B:
+     model_id_or_path: qwen/Qwen-7B
+     revision: v1.1.8 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model # TODO: tokenizer issue
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+ # model of auto-reviewer
+ reviews_gen:
+   enable: true
+   reviewer:
+     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+     args:
+       model: gpt-4
+       max_tokens: 1024
+       temperature: 0
+       # pairwise comparison against baseline
+       mode: pairwise_baseline
+       # position bias mitigation strategy, options: swap_position, randomize_order, None. default is None
+       position_bias_mitigation: swap_position
+       # completion parser config, default is lmsys_parser
+       fn_completion_parser: lmsys_parser
+   # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
+   target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
+                    registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
+   # the path to the outputs of the baseline model
+   baseline_file: registry/data/arena/answers/answer_text_davinci_003.jsonl
+   # the path to the reference answers
+   reference_file:
+   # prompt templates for auto reviewer(GPT-4)
+   prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
+   # output file of auto reviewer
+   review_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
+   # cache file of auto reviewer
+   cache_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
+
+ # rating results
+ rating_gen:
+   enable: true
+   metrics: ['pairwise']
+   baseline_model: text_davinci_003
+   # elo rating report file
+   report_file: registry/data/arena/reports/rating_pairwise_baseline.csv
evalscope/registry/config/cfg_single.yaml
@@ -0,0 +1,78 @@
+ # input raw data
+ question_file: registry/data/question.jsonl
+
+ # candidate models to be battled
+ answers_gen:
+   chatglm3-6b:
+     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
+     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model
+     template_type: chatglm3
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     # output predicted answer file name
+     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
+   Baichuan2-7B-Base:
+     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
+     revision: v1.0.2 # revision of model, default is NULL
+     precision: torch.float16
+     enable: false # enable or disable this model
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
+   Qwen-7B:
+     model_id_or_path: qwen/Qwen-7B
+     revision: v1.1.8 # revision of model, default is NULL
+     precision: torch.float16
+     enable: true # enable or disable this model # TODO: tokenizer issue
+     template_type: default-generation
+     generation_config:
+       do_sample: true
+       max_new_tokens: 256
+       top_k: 20
+       top_p: 0.75
+       temperature: 0.3
+     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
+
+ # model of auto-reviewer
+ reviews_gen:
+   enable: true
+   reviewer:
+     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
+     args:
+       model: gpt-4
+       max_tokens: 1024
+       temperature: 0
+       # pairwise comparison against baseline
+       mode: single
+       # completion parser config, default is lmsys_parser
+       fn_completion_parser: lmsys_parser
+   # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
+   target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
+                    registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
+   # the path to the reference answers
+   reference_file:
+   # prompt templates for auto reviewer(GPT-4)
+   prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
+   # output file of auto reviewer
+   review_file: registry/data/arena/reviews/review_gpt4_single.jsonl
+   # cache file of auto reviewer
+   cache_file: registry/data/arena/reviews/review_gpt4_single.jsonl
+
+ # rating results
+ rating_gen:
+   enable: true
+   metrics: ['score']
+   # elo rating report file
+   report_file: registry/data/arena/reports/rating_single.csv