evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/datasets/line_by_line.py
CHANGED

```diff
@@ -20,6 +20,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
                 if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
```
evalscope/perf/plugin/datasets/longalpaca.py
CHANGED

```diff
@@ -25,6 +25,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
                 if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
```
evalscope/perf/plugin/datasets/openqa.py
CHANGED

```diff
@@ -30,6 +30,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
             if (len(prompt) > self.query_parameters.min_prompt_length
                     and len(prompt) < self.query_parameters.max_prompt_length):
                 if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
```
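The three dataset plugins above all switch from building chat messages inline to calling `self.create_message(prompt)` on the shared base class (`evalscope/perf/plugin/datasets/base.py`, whose diff is not shown in this excerpt). The exact signature is not visible here, so the sketch below is only an assumption inferred from the call sites `create_message(prompt)`, `create_message(text='')`, and `create_message(text=..., image_urls=[...])` elsewhere in this diff:

```python
# Illustrative sketch only -- not the actual base.py implementation.
from typing import Dict, List, Optional


def create_message(text: str, image_urls: Optional[List[str]] = None) -> Dict:
    """Build a single OpenAI-style user message from text and optional images."""
    if not image_urls:
        return {'role': 'user', 'content': text}
    # Multimodal content: one text part plus one image_url part per image.
    content: List[Dict] = [{'type': 'text', 'text': text}]
    content += [{'type': 'image_url', 'image_url': {'url': url}} for url in image_urls]
    return {'role': 'user', 'content': content}


# Each plugin then yields a one-message conversation:
# message = create_message(prompt); yield [message]
```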
evalscope/perf/plugin/datasets/random_dataset.py
CHANGED

```diff
@@ -37,12 +37,23 @@ class RandomDatasetPlugin(DatasetPluginBase):
         input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
         offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
+        vocab_size = self.tokenizer.vocab_size
+
         for i in range(self.number):
-
-
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
 
             if self.query_parameters.apply_chat_template:
-
+                message = self.create_message(prompt)
+                yield [message]
             else:
                 yield prompt
 
@@ -53,6 +64,6 @@ class RandomDatasetPlugin(DatasetPluginBase):
         return input_ids
 
     def get_template_len(self):
-        empty_message = [
+        empty_message = [self.create_message(text='')]
         template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
         return len(template)
```
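The comment in the hunk above explains the decode/re-encode step: decoding N random token ids and re-tokenizing the resulting string does not always give back N tokens. A standalone sketch of the same trick, assuming a Hugging Face tokenizer (the model name below is only an example):

```python
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # example model

target_len = 128
offset = int(np.random.randint(0, tokenizer.vocab_size))
# A run of consecutive token ids, wrapped around the vocabulary.
token_ids = ((offset + np.arange(target_len)) % tokenizer.vocab_size).tolist()

prompt = tokenizer.decode(token_ids)
# Re-encode and truncate so the prompt is pinned to the target token length.
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(re_encoded)
```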
evalscope/perf/plugin/datasets/random_vl_dataset.py
ADDED

```diff
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG')
```
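`PIL_to_base64` is imported from `evalscope.utils.io_utils` (that file's diff is listed above but not shown here). For reference, an equivalent standalone helper, written independently of evalscope, might look like this:

```python
import base64
from io import BytesIO

from PIL import Image


def pil_to_base64(image: Image.Image, format: str = 'PNG') -> str:
    """Serialize a PIL image to a base64 string (no data-URL prefix)."""
    buffer = BytesIO()
    image.save(buffer, format=format)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


# e.g. embed a 64x64 solid-red image as a data URL
img = Image.new('RGB', (64, 64), (255, 0, 0))
data_url = f'data:image/png;base64,{pil_to_base64(img)}'
```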
evalscope/perf/plugin/registry.py
CHANGED

```diff
@@ -1,23 +1,25 @@
-from typing import Any, List, Type, Union
+from typing import TYPE_CHECKING, Any, List, Type, Union
 
+if TYPE_CHECKING:
+    from .api import ApiPluginBase
+    from .datasets import DatasetPluginBase
 
-class PluginRegistry:
-
-    def __init__(self):
-        self._registry = {}
 
-
-
-        return cls
+class PluginRegistry:
+    _registry = {}
 
-
-
+    @classmethod
+    def register(cls, name, plugin_cls):
+        cls._registry[name] = plugin_cls
+        return plugin_cls
 
-
-
+    @classmethod
+    def get_class(cls, name):
+        return cls._registry[name]
 
-
-
+    @classmethod
+    def all_classes(cls):
+        return list(cls._registry.keys())
 
 
 def register_dataset(name: Union[str, List[str]]):
@@ -50,5 +52,23 @@ def register_api(name: Union[str, List[str]]):
     return class_decorator
 
 
-DatasetRegistry
-
+class DatasetRegistry(PluginRegistry):
+    """Registry for dataset plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['DatasetPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"Dataset plugin '{name}' is not registered.")
+        return cls._registry[name]
+
+
+class ApiRegistry(PluginRegistry):
+    """Registry for API plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['ApiPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"API plugin '{name}' is not registered.")
+        return cls._registry[name]
```
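The registry refactor above moves from a per-instance dictionary to class-level `_registry` dicts, with `DatasetRegistry` and `ApiRegistry` each overriding `_registry` so the two plugin families do not share state. A minimal self-contained sketch of the pattern, following the code shown above (the `DummyPlugin` class is hypothetical and only illustrates the mechanics):

```python
class PluginRegistry:
    _registry = {}

    @classmethod
    def register(cls, name, plugin_cls):
        cls._registry[name] = plugin_cls
        return plugin_cls

    @classmethod
    def get_class(cls, name):
        return cls._registry[name]


class DatasetRegistry(PluginRegistry):
    _registry = {}  # separate dict, so dataset and API plugins cannot collide


def register_dataset(name):
    # decorator used as @register_dataset('random_vl') above
    def class_decorator(plugin_cls):
        return DatasetRegistry.register(name, plugin_cls)
    return class_decorator


@register_dataset('dummy')
class DummyPlugin:
    pass


assert DatasetRegistry.get_class('dummy') is DummyPlugin
```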
evalscope/perf/utils/analysis_result.py
CHANGED

```diff
@@ -3,27 +3,28 @@ import json
 import pickle
 import sqlite3
 
-
-
-
-    FROM result WHERE success='1'"
+db_path = 'your db path'
+conn = sqlite3.connect(db_path)
+cursor = conn.cursor()
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# 获取列名
+cursor.execute('PRAGMA table_info(result)')
+columns = [info[1] for info in cursor.fetchall()]
+print('列名:', columns)
+
+cursor.execute('SELECT * FROM result WHERE success=1 AND first_chunk_latency > 1')
+rows = cursor.fetchall()
+print(f'len(rows): {len(rows)}')
+
+for row in rows:
+    row_dict = dict(zip(columns, row))
+    # 解码request
+    row_dict['request'] = pickle.loads(base64.b64decode(row_dict['request']))
+    # 解码response_messages
+    row_dict['response_messages'] = pickle.loads(base64.b64decode(row_dict['response_messages']))
+    # print(row_dict)
+    print(
+        f"request_id: {json.loads(row_dict['response_messages'][0])['id']}, first_chunk_latency: {row_dict['first_chunk_latency']}"  # noqa: E501
+    )
+    # 如果只想看一个可以break
+    # break
```
evalscope/perf/utils/benchmark_util.py
CHANGED

```diff
@@ -20,25 +20,24 @@ class BenchmarkData:
     # late init
     query_latency: float = 0.0
     first_chunk_latency: float = 0.0
-    n_chunks: int = 0
-    n_chunks_time: float = 0.0
     max_gpu_memory_cost = 0
     time_per_output_token: float = 0.0
+    inter_chunk_latency: List[float] = field(default_factory=list)
 
     prompt_tokens = None
     completion_tokens = None
 
-    def _calculate_query_stream_metric(self) ->
+    def _calculate_query_stream_metric(self) -> None:
         self.query_latency = self.completed_time - self.start_time
+        # only for stream responses
         if len(self.chunk_times) > 1:
             self.first_chunk_latency = self.chunk_times[0] - self.start_time
-
-            self.
+            # remove the first chunk time from the total latency
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
+                self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
         else:
             self.first_chunk_latency = self.query_latency
-            self.n_chunks = 1
-            self.n_chunks_time = self.query_latency
-            self.time_per_output_token = self.n_chunks_time / self.n_chunks
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -63,10 +62,9 @@ class Metrics:
     AVERAGE_LATENCY = 'Average latency (s)'
     AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
     AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INTER_TOKEN_LATENCY = 'Average inter-token latency (s)'
     AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
     AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
-    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
-    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
 
 
 @dataclass
@@ -76,25 +74,23 @@ class BenchmarkMetrics:
     n_failed_queries: int = 0
     total_first_chunk_latency: float = 0.0
     total_latency: float = 0.0
-    n_total_chunks: int = 0
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
-    total_chunks_time: float = 0.0
     start_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0
+    n_total_inter_token_latency: List[float] = field(default_factory=list)
 
     avg_first_chunk_latency: float = -1
     avg_latency: float = -1
-    n_avg_chunks: float = -1
-    avg_chunk_time: float = -1
     avg_prompt_tokens: float = -1
     avg_completion_tokens: float = -1
     avg_input_token_per_seconds: float = -1
     avg_output_token_per_seconds: float = -1
     avg_total_token_per_seconds: float = -1
     avg_time_per_token: float = -1
+    avg_inter_token_latency: float = -1
     qps: float = -1
 
     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
@@ -113,9 +109,8 @@ class BenchmarkMetrics:
             benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
-            self.n_total_chunks += benchmark_data.n_chunks
-            self.total_chunks_time += benchmark_data.n_chunks_time
            self.n_time_per_output_token += benchmark_data.time_per_output_token
+            self.n_total_inter_token_latency += benchmark_data.inter_chunk_latency
         else:
             self.n_failed_queries += 1
 
@@ -127,8 +122,6 @@ class BenchmarkMetrics:
         try:
             self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
             self.avg_latency = self.total_latency / self.n_succeed_queries
-            self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
-            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
             self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
@@ -136,6 +129,8 @@ class BenchmarkMetrics:
             self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
                                                 + self.n_total_completion_tokens) / self.total_time
             self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
+            self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
+                self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
@@ -154,9 +149,8 @@ class BenchmarkMetrics:
             Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
             Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
             Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INTER_TOKEN_LATENCY: round(self.avg_inter_token_latency, default_ndigits),
             Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
             Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
```
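The new stream metrics above replace the per-chunk ("package") counters with time-per-output-token and inter-chunk latency. A small worked example, independent of evalscope, of how those values fall out of the chunk timestamps:

```python
# Hypothetical timings for one streamed request (seconds).
start_time = 0.00
chunk_times = [0.35, 0.40, 0.46, 0.51]  # arrival time of each streamed chunk
completed_time = 0.51
completion_tokens = 4

query_latency = completed_time - start_time        # total latency: 0.51 s
first_chunk_latency = chunk_times[0] - start_time   # time to first token: 0.35 s

# Time per output token excludes the first chunk (prefill) from the latency.
time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)

# Inter-chunk (inter-token) latencies are the gaps between consecutive chunks.
inter_chunk_latency = [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]

print(round(time_per_output_token, 4))              # ~0.0533
print([round(x, 2) for x in inter_chunk_latency])   # [0.05, 0.06, 0.05]
```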
evalscope/perf/utils/db_util.py
CHANGED

```diff
@@ -16,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    CHUNK_TIMES = 'chunk_times'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
@@ -34,20 +56,20 @@ def transpose_results(data):
 
 
 def create_result_table(cursor):
-    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
-
-
-
-
-
-
-
-
-
-
-
-
+    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+        {DatabaseColumns.REQUEST} TEXT,
+        {DatabaseColumns.START_TIME} REAL,
+        {DatabaseColumns.CHUNK_TIMES} TEXT,
+        {DatabaseColumns.SUCCESS} INTEGER,
+        {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+        {DatabaseColumns.COMPLETED_TIME} REAL,
+        {DatabaseColumns.LATENCY} REAL,
+        {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+        {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+        {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+        {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+        {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+    )''')
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -67,24 +89,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 
     if benchmark_data.success:
         # Add additional columns for success case
-        additional_columns = (
-
-
-
-
-
-
-
-
-
-            request, start_time, chunk_times, success, response_messages,
-            completed_time, latency, first_chunk_latency,
-            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
+                              benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
+                              benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
         ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
 
@@ -160,44 +179,43 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
             logger.error(f'Error parsing chunk times: {e}')
             return []
 
-    query_sql =
-
-
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
     with sqlite3.connect(result_db_path) as con:
-
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()
 
-        #
-
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        CHUNK_TIME_INDEX = 7
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}
 
         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-            inter_token_latencies_all.extend(inter_token_latencies(row[
+            inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
 
         metrics = {
-            PercentileMetrics.TTFT: [row[
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
             PercentileMetrics.ITL:
             inter_token_latencies_all,
-            PercentileMetrics.TPOT:
-            [
-
-            PercentileMetrics.
-            PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
-            PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
             PercentileMetrics.OUTPUT_THROUGHPUT:
-            [(row[
-             for row in rows],
-            PercentileMetrics.TOTAL_THROUGHPUT:
-
-
+            [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+            [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+              / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }
 
         # Calculate percentiles for each metric
@@ -237,18 +255,18 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
 
 
 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-
-            ROUND(AVG(
-            ROUND(AVG(
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
            result
        WHERE
-
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
        GROUP BY
-
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501
 
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
```
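A side note on the `DatabaseColumns` refactor above: result rows used to be read with hard-coded positional indices (`LATENCY_INDEX = 4` and so on), which breaks silently whenever the schema changes. Mapping `cursor.description` to indices keeps access keyed by column name. A tiny standalone illustration (not evalscope code):

```python
import sqlite3

con = sqlite3.connect(':memory:')
cur = con.cursor()
cur.execute('CREATE TABLE result(prompt_tokens INTEGER, completion_tokens INTEGER, latency REAL)')
cur.execute('INSERT INTO result VALUES (100, 50, 2.0)')

cur.execute('SELECT * FROM result')
columns = [d[0] for d in cur.description]
col_indices = {col: idx for idx, col in enumerate(columns)}

row = cur.fetchone()
# Name-based access survives column reordering, unlike row[1], row[2], ...
tpot = row[col_indices['latency']] / row[col_indices['completion_tokens']]
print(tpot)  # 0.04
```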
evalscope/report/__init__.py
CHANGED

```diff
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import gen_table, get_data_frame, get_report_list
     from .generator import ReportGenerator
     from .utils import Category, Report, ReportKey, Subset
 
```
evalscope/report/utils.py
CHANGED

```diff
@@ -3,14 +3,45 @@ import os
 import pandas as pd
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
 from evalscope.metrics import macro_mean, micro_mean
-from evalscope.utils import
-from evalscope.utils.logger import get_logger
+from evalscope.utils import get_logger
 
 logger = get_logger()
 
+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+3. 只列出报告本身,不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
+def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+    """
+    Normalize score.
+
+    Args:
+        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
+        keep_num: number of digits to keep.
+
+    Returns:
+        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
+    """
+    if isinstance(score, float):
+        score = round(score, keep_num)
+    elif isinstance(score, dict):
+        score = {k: round(v, keep_num) for k, v in score.items()}
+    else:
+        logger.warning(f'Unknown score type: {type(score)}')
+
+    return score
+
 
 @dataclass
 class Subset:
@@ -74,18 +105,6 @@ class ReportKey:
     score = 'Score'
 
 
-ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
-1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
-2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
-3. 只列出报告本身,不要有其他多余内容
-4. 输出报告语言为{language}
-
-```json
-{report_str}
-```
-"""
-
-
 @dataclass
 class Report:
     name: str = 'default_report'
```