evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/plugin/api/default_api.py
CHANGED

@@ -1,24 +1,68 @@
 import aiohttp
 import json
-
-
+import sys
+import time
+import traceback
+from typing import Any, Dict

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger

 logger = get_logger()


+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        chunk_str = chunk_bytes.decode('utf-8')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""

     def __init__(self, param: Arguments):
         super().__init__(param)

-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.

         Args:
@@ -27,79 +71,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body

-
-
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-async
-
-if
- (removed old lines 56-105 are not rendered in this diff view)
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                            generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
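The new StreamedResponseHandler buffers raw SSE bytes and only emits complete `data:` messages. Below is a minimal sketch of how it behaves when a message is split across two network reads; the import path is assumed from the file list above, and the chunk payloads are illustrative only.

```python
# Sketch only: feeding two partial SSE chunks into the handler added above.
# Import path assumed from the file list; payloads are made up for illustration.
from evalscope.perf.plugin.api.default_api import StreamedResponseHandler

handler = StreamedResponseHandler()
chunks = [
    b'data: {"choices": [{"delta": {"content": "Hel',   # read ends mid-JSON
    b'lo"}}]}\n\ndata: [DONE]\n\n',                      # completes it, then [DONE]
]
for chunk in chunks:
    for message in handler.add_chunk(chunk):
        print(message)
# The first chunk yields nothing (the incomplete JSON stays buffered); the
# second yields the completed data line followed by 'data: [DONE]'.
```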
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -102,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
@@ -180,7 +180,7 @@ class OpenaiPlugin(DefaultApiPlugin):
         )
         return input_tokens, output_tokens

-    def _count_input_tokens(self,
+    def _count_input_tokens(self, request_str: str) -> int:
         """Count the number of input tokens in the request.

         This method handles different types of requests and calculates tokens for:
@@ -188,13 +188,14 @@ class OpenaiPlugin(DefaultApiPlugin):
         - Images in multimodal messages (converted to patch tokens)

         Args:
-
+            request_str (str): The request json str containing either 'messages' for chat
                 completion or 'prompt' for text completion.

         Returns:
             int: The total number of input tokens including text and image tokens.
         """
         input_tokens = 0
+        request = json.loads(request_str)
         if 'messages' in request:
             input_content = self.tokenizer.apply_chat_template(
                 request['messages'], tokenize=True, add_generation_prompt=True
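With this change the request stored on BenchmarkData is the serialized JSON body, so the token counter parses it before branching on chat vs. text completion. A small illustrative sketch of that branching (model name and prompts are made up):

```python
# Illustrative only: the 'messages' vs 'prompt' distinction that
# _count_input_tokens dispatches on after json.loads(request_str).
import json

chat_request = json.dumps({'model': 'qwen', 'messages': [{'role': 'user', 'content': 'Hi'}]})
text_request = json.dumps({'model': 'qwen', 'prompt': 'Hi'})

for request_str in (chat_request, text_request):
    request = json.loads(request_str)
    kind = 'messages' if 'messages' in request else 'prompt'
    print(kind)  # prints "messages", then "prompt"
```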
evalscope/perf/plugin/datasets/base.py
CHANGED

@@ -15,6 +15,11 @@ class DatasetPluginBase:
             dataset_path (str, optional): The input dataset path. Defaults to None.
         """
         self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None

     def __next__(self):
         for item in self.build_messages():
@@ -85,3 +90,19 @@ class DatasetPluginBase:
         for url in image_urls:
             message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
         return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
evalscope/perf/plugin/datasets/custom.py
CHANGED

@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-
-
-) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]
evalscope/perf/plugin/datasets/line_by_line.py
CHANGED

@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-
-
-) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]
evalscope/perf/plugin/datasets/longalpaca.py
CHANGED

@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-
-
-) < self.query_parameters.max_prompt_length:
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]
evalscope/perf/plugin/datasets/openqa.py
CHANGED

@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-
-
-and len(prompt) < self.query_parameters.max_prompt_length
-):
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
                 if self.query_parameters.apply_chat_template:
                     message = self.create_message(prompt)
                     yield [message]
evalscope/perf/plugin/datasets/random_dataset.py
CHANGED

@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
     """

     def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.'  # noqa: E501
         super().__init__(query_parameters)
-        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501

-        from modelscope import AutoTokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
         self.prefix_length = self.query_parameters.prefix_length
         self.prefix_ids = self.get_random_inputs(self.prefix_length)
         self.template_len = self.get_template_len()
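All of the dataset plugins above now delegate length filtering to the shared check_prompt_length helper, which counts tokens when a tokenizer path is configured and falls back to character length otherwise. Below is a minimal sketch of a custom plugin following the same pattern; the class name and dataset format are hypothetical, while the base-class methods are those shown in the hunks above.

```python
# Hypothetical plugin, sketched against the DatasetPluginBase API shown above.
from typing import Dict, Iterator, List

from evalscope.perf.plugin.datasets.base import DatasetPluginBase


class MyPromptsPlugin(DatasetPluginBase):
    """Yields one chat message per prompt whose length is within range."""

    def build_messages(self) -> Iterator[List[Dict]]:
        for line in self.dataset_line_by_line(self.query_parameters.dataset_path):
            prompt = line.strip()
            # Token count if a tokenizer was configured, else character count.
            is_valid, _ = self.check_prompt_length(prompt)
            if is_valid:
                if self.query_parameters.apply_chat_template:
                    yield [self.create_message(prompt)]
```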
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -1,4 +1,3 @@
-import time
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple

@@ -10,7 +9,7 @@ logger = get_logger()

 @dataclass
 class BenchmarkData:
-    request:
+    request: str = None  # json serialized request body
     start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)
@@ -24,24 +23,26 @@ class BenchmarkData:
     time_per_output_token: float = 0.0
     inter_chunk_latency: List[float] = field(default_factory=list)

- (removed old lines 27-31 are not rendered in this diff view)
-        # only for stream responses
-        if len(self.chunk_times) > 1:
-            self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            # remove the first chunk time from the total latency
-            self.time_per_output_token = (self.query_latency - self.first_chunk_latency
-                                          ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
-            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
-        else:
-            self.first_chunk_latency = self.query_latency
+    # response content
+    generated_text: str = ''
+    error: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None

     def _calculate_tokens(self, api_plugin):
-self.prompt_tokens
-
+        if self.prompt_tokens is None or self.completion_tokens is None:
+            self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+                self.response_messages, request=self.request
+            )
+
+        # Calculate time per output token
+        if self.completion_tokens and self.completion_tokens > 1:
+            # tpot = (latency - ttft) / (output_len - 1)
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+        # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+        if not self.inter_chunk_latency and self.chunk_times:
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]

     def update_gpu_usage(self):
         if check_import('torch', raise_warning=False):
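_calculate_tokens now derives time-per-output-token directly from the recorded latencies, using tpot = (query_latency - first_chunk_latency) / (completion_tokens - 1). A small worked example with illustrative numbers:

```python
# Illustrative numbers only, to show the formula used in _calculate_tokens.
query_latency = 2.0          # seconds, end-to-end request latency
first_chunk_latency = 0.5    # seconds to first chunk (TTFT)
completion_tokens = 31       # tokens generated

tpot = (query_latency - first_chunk_latency) / (completion_tokens - 1)
print(f'{tpot * 1000:.1f} ms/token')  # 50.0 ms/token
```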
@@ -79,6 +80,7 @@ class BenchmarkMetrics:
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
     start_time: Optional[float] = None
+    last_completed_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0
@@ -97,9 +99,6 @@ class BenchmarkMetrics:

     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
         self.n_total_queries += 1
-        if self.start_time is None:
-            self.start_time = benchmark_data.start_time
-        self.total_time = time.perf_counter() - self.start_time

         if benchmark_data.success:
             self.n_succeed_queries += 1
@@ -108,7 +107,6 @@ class BenchmarkMetrics:
             self.n_total_prompt_tokens += benchmark_data.prompt_tokens
             self.n_total_completion_tokens += benchmark_data.completion_tokens

-            benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_time_per_output_token += benchmark_data.time_per_output_token
@@ -117,6 +115,22 @@ class BenchmarkMetrics:
             self.n_failed_queries += 1

         self.calculate_averages()
+        self.update_total_time(benchmark_data)
+
+    def update_total_time(self, benchmark_data: BenchmarkData):
+        # Use the earliest start_time seen so far
+        if self.start_time is None:
+            self.start_time = benchmark_data.start_time
+        else:
+            self.start_time = min(self.start_time, benchmark_data.start_time)
+        # Track the latest completion time
+        if self.last_completed_time is None:
+            self.last_completed_time = benchmark_data.completed_time
+        else:
+            self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+        # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+        if self.start_time is not None and self.last_completed_time is not None:
+            self.total_time = max(self.last_completed_time - self.start_time, 0.0)

     def calculate_averages(self):
         if self.n_succeed_queries == 0:
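update_total_time replaces the old wall-clock measurement: the benchmarking window now spans from the earliest request start to the latest completion, so throughput is not skewed by how quickly the result consumer drains the queue. A sketch with illustrative timestamps:

```python
# Illustrative timestamps (seconds) for three completed requests.
starts = [0.00, 0.05, 0.10]       # BenchmarkData.start_time values
completions = [1.20, 1.10, 1.35]  # BenchmarkData.completed_time values

total_time = max(max(completions) - min(starts), 0.0)  # 1.35 s window
total_completion_tokens = 900                           # made-up total
print(f'{total_completion_tokens / total_time:.1f} tokens/s')  # ~666.7 tokens/s
```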
evalscope/perf/utils/db_util.py
CHANGED

@@ -19,7 +19,7 @@ logger = get_logger()
 class DatabaseColumns:
     REQUEST = 'request'
     START_TIME = 'start_time'
-
+    INTER_TOKEN_LATENCIES = 'inter_token_latencies'
     SUCCESS = 'success'
     RESPONSE_MESSAGES = 'response_messages'
     COMPLETED_TIME = 'completed_time'
@@ -60,7 +60,7 @@ def create_result_table(cursor):
         f'''CREATE TABLE IF NOT EXISTS result(
             {DatabaseColumns.REQUEST} TEXT,
             {DatabaseColumns.START_TIME} REAL,
-            {DatabaseColumns.
+            {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
             {DatabaseColumns.SUCCESS} INTEGER,
             {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
             {DatabaseColumns.COMPLETED_TIME} REAL,
@@ -75,15 +75,15 @@ def create_result_table(cursor):


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-    request =
-
+    request = benchmark_data.request
+    inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
     response_messages = encode_data(benchmark_data.response_messages)

     # Columns common to both success and failure cases
     common_columns = (
         request,
         benchmark_data.start_time,
-
+        inter_token_latencies,
         benchmark_data.success,
         response_messages,
         benchmark_data.completed_time,
@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
             benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
         )
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
             {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
             {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
         cursor.execute(query, common_columns + additional_columns)
     else:
         query = f"""INSERT INTO result(
-            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
             ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     :param result_db_path: Path to the SQLite database file.
     :return: Dictionary of percentiles for various metrics.
     """
-
-    def inter_token_latencies(chunk_times_json: str) -> List[float]:
-        try:
-            chunk_times = json.loads(chunk_times_json)
-            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
-        except (json.JSONDecodeError, TypeError) as e:
-            logger.error(f'Error parsing chunk times: {e}')
-            return []
-
-    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
         {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
         {DatabaseColumns.PROMPT_TOKENS},
         {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
-        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''  # noqa: E501

     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     # Prepare data for each metric
     inter_token_latencies_all = []
     for row in rows:
-
+        try:
+            itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+            inter_token_latencies_all.extend(itl)
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing inter token latencies: {e}')

     metrics = {
         PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],