evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/datasets/line_by_line.py CHANGED
@@ -20,6 +20,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
                 if self.query_parameters.apply_chat_template:
-                    yield [{'role': 'user', 'content': prompt}]
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -25,6 +25,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
                 if self.query_parameters.apply_chat_template:
-                    yield [{'role': 'user', 'content': prompt}]
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
evalscope/perf/plugin/datasets/openqa.py CHANGED
@@ -30,6 +30,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
             if (len(prompt) > self.query_parameters.min_prompt_length
                     and len(prompt) < self.query_parameters.max_prompt_length):
                 if self.query_parameters.apply_chat_template:
-                    yield [{'role': 'user', 'content': prompt}]
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
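Note: the three dataset plugins above now delegate message construction to a `create_message` helper on the plugin base class (`evalscope/perf/plugin/datasets/base.py`, +22 -1 in this release, not shown in this diff). A rough sketch of what such a helper might look like, inferred only from the call sites visible here (plain text above, text plus `image_urls` in the `random_vl` plugin further down); the real implementation may differ:

```python
from typing import Dict, List, Optional


def create_message(text: str, image_urls: Optional[List[str]] = None) -> Dict:
    """Illustrative stand-in for DatasetPluginBase.create_message (assumed shape)."""
    if not image_urls:
        # Plain chat message, as used by line_by_line / longalpaca / openqa above
        return {'role': 'user', 'content': text}
    # Multimodal message, matching the text + image_urls call in the random_vl plugin
    content = [{'type': 'text', 'text': text}]
    content += [{'type': 'image_url', 'image_url': {'url': url}} for url in image_urls]
    return {'role': 'user', 'content': content}
```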
evalscope/perf/plugin/datasets/random_dataset.py CHANGED
@@ -37,12 +37,23 @@ class RandomDatasetPlugin(DatasetPluginBase):
         input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
         offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
+        vocab_size = self.tokenizer.vocab_size
+
         for i in range(self.number):
-            prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
-            prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
 
             if self.query_parameters.apply_chat_template:
-                yield [{'role': 'user', 'content': prompt}]
+                message = self.create_message(prompt)
+                yield [message]
             else:
                 yield prompt
 
@@ -53,6 +64,6 @@ class RandomDatasetPlugin(DatasetPluginBase):
         return input_ids
 
     def get_template_len(self):
-        empty_message = [{'role': 'user', 'content': ''}]
+        empty_message = [self.create_message(text='')]
        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
         return len(template)
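The comment added in `RandomDatasetPlugin.build_messages` is the key point of this hunk: decoding N random token ids and re-tokenizing the resulting string does not always yield N tokens, so the prompt is re-encoded and truncated to `total_input_len`. A small standalone illustration of that effect ('gpt2' is just an arbitrary tokenizer picked for the demo, not something this diff uses):

```python
import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')  # arbitrary tokenizer, for demonstration only
ids = np.random.randint(0, tok.vocab_size, size=64).tolist()

text = tok.decode(ids)                               # turn random token ids into text
re_ids = tok.encode(text, add_special_tokens=False)  # tokenize that text again
print(len(ids), len(re_ids))  # frequently unequal, hence the truncation to total_input_len
```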
evalscope/perf/plugin/datasets/random_vl_dataset.py ADDED
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG')
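The new `random_vl` plugin above feeds each generated PIL image to `PIL_to_base64` (from `evalscope.utils.io_utils`) and wraps the result as a `data:image/png;base64,...` URL. A minimal self-contained sketch of that idea, with the base64 step inlined instead of the evalscope helper:

```python
import base64
import io
import random

from PIL import Image


def random_image_data_url(width: int = 224, height: int = 224) -> str:
    """Solid-color PNG wrapped as a data URL (inlined equivalent of the PIL_to_base64 call)."""
    color = tuple(random.randint(0, 255) for _ in range(3))
    image = Image.new('RGB', (width, height), color)
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    encoded = base64.b64encode(buffer.getvalue()).decode('ascii')
    return f'data:image/png;base64,{encoded}'


print(random_image_data_url()[:64])
```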
evalscope/perf/plugin/registry.py CHANGED
@@ -1,23 +1,25 @@
-from typing import Any, List, Type, Union
+from typing import TYPE_CHECKING, Any, List, Type, Union
 
+if TYPE_CHECKING:
+    from .api import ApiPluginBase
+    from .datasets import DatasetPluginBase
 
-class PluginRegistry:
-
-    def __init__(self):
-        self._registry = {}
 
-    def register(self, name, cls):
-        self._registry[name] = cls
-        return cls
+class PluginRegistry:
+    _registry = {}
 
-    def get_class(self, name):
-        return self._registry[name]
+    @classmethod
+    def register(cls, name, plugin_cls):
+        cls._registry[name] = plugin_cls
+        return plugin_cls
 
-    def all_classes(self):
-        return list(self._registry.keys())
+    @classmethod
+    def get_class(cls, name):
+        return cls._registry[name]
 
-    def __call__(self, name: str) -> Any:
-        return self.get_class(name)
+    @classmethod
+    def all_classes(cls):
+        return list(cls._registry.keys())
 
 
 def register_dataset(name: Union[str, List[str]]):
@@ -50,5 +52,23 @@ def register_api(name: Union[str, List[str]]):
     return class_decorator
 
 
-DatasetRegistry = PluginRegistry()
-ApiRegistry = PluginRegistry()
+class DatasetRegistry(PluginRegistry):
+    """Registry for dataset plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['DatasetPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"Dataset plugin '{name}' is not registered.")
+        return cls._registry[name]
+
+
+class ApiRegistry(PluginRegistry):
+    """Registry for API plugins."""
+    _registry = {}
+
+    @classmethod
+    def get_class(cls, name: str) -> Type['ApiPluginBase']:
+        if name not in cls._registry:
+            raise ValueError(f"API plugin '{name}' is not registered.")
+        return cls._registry[name]
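With the registries turned into class-based singletons (state lives in a class-level `_registry` dict rather than on an instance), registration and lookup become classmethod calls, and unknown names now raise a `ValueError` instead of a bare `KeyError`. A small usage sketch against the new API, using a made-up plugin class:

```python
from evalscope.perf.plugin.registry import DatasetRegistry


class MyDatasetPlugin:  # hypothetical plugin, for illustration only
    pass


DatasetRegistry.register('my_dataset', MyDatasetPlugin)    # classmethod, no instance needed
assert DatasetRegistry.get_class('my_dataset') is MyDatasetPlugin
print(DatasetRegistry.all_classes())                       # ['my_dataset', ...]

try:
    DatasetRegistry.get_class('missing')
except ValueError as err:
    print(err)  # unregistered names now raise ValueError instead of KeyError
```

Because `DatasetRegistry` and `ApiRegistry` each redefine `_registry = {}`, dataset and API plugins keep separate namespaces even though both inherit from `PluginRegistry`.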
@@ -3,27 +3,28 @@ import json
 import pickle
 import sqlite3
 
-result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
-con = sqlite3.connect(result_db_path)
-query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
-            FROM result WHERE success='1'"
+db_path = 'your db path'
+conn = sqlite3.connect(db_path)
+cursor = conn.cursor()
 
-# how to save base64.b64encode(pickle.dumps(benchmark_data["request"])).decode("ascii"),
-with con:
-    rows = con.execute(query_sql).fetchall()
-    if len(rows) > 0:
-        for row in rows:
-            request = row[0]
-            responses = row[1]
-            request = base64.b64decode(request)
-            request = pickle.loads(request)
-            responses = base64.b64decode(responses)
-            responses = pickle.loads(responses)
-            response_content = ''
-            for response in responses:
-                response = json.loads(response)
-                if not response['choices']:
-                    continue
-                response_content += response['choices'][0]['delta']['content']
-            print('prompt: %s, tokens: %s, completion: %s, tokens: %s' %
-                  (request['messages'][0]['content'], row[2], response_content, row[3]))
+# Get the column names
+cursor.execute('PRAGMA table_info(result)')
+columns = [info[1] for info in cursor.fetchall()]
+print('Columns:', columns)
+
+cursor.execute('SELECT * FROM result WHERE success=1 AND first_chunk_latency > 1')
+rows = cursor.fetchall()
+print(f'len(rows): {len(rows)}')
+
+for row in rows:
+    row_dict = dict(zip(columns, row))
+    # Decode request
+    row_dict['request'] = pickle.loads(base64.b64decode(row_dict['request']))
+    # Decode response_messages
+    row_dict['response_messages'] = pickle.loads(base64.b64decode(row_dict['response_messages']))
+    # print(row_dict)
+    print(
+        f"request_id: {json.loads(row_dict['response_messages'][0])['id']}, first_chunk_latency: {row_dict['first_chunk_latency']}"  # noqa: E501
+    )
+    # break here if you only want to inspect a single row
+    # break
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -20,25 +20,24 @@ class BenchmarkData:
     # late init
     query_latency: float = 0.0
     first_chunk_latency: float = 0.0
-    n_chunks: int = 0
-    n_chunks_time: float = 0.0
     max_gpu_memory_cost = 0
     time_per_output_token: float = 0.0
+    inter_chunk_latency: List[float] = field(default_factory=list)
 
     prompt_tokens = None
     completion_tokens = None
 
-    def _calculate_query_stream_metric(self) -> Tuple[float, int, float]:
+    def _calculate_query_stream_metric(self) -> None:
         self.query_latency = self.completed_time - self.start_time
+        # only for stream responses
         if len(self.chunk_times) > 1:
             self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            self.n_chunks = len(self.chunk_times) - 2  # remove last and first chunk
-            self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
+            # remove the first chunk time from the total latency
+            self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
+                self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
         else:
             self.first_chunk_latency = self.query_latency
-            self.n_chunks = 1
-            self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.n_chunks_time / self.n_chunks
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -63,10 +62,9 @@ class Metrics:
     AVERAGE_LATENCY = 'Average latency (s)'
     AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
     AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INTER_TOKEN_LATENCY = 'Average inter-token latency (s)'
     AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
     AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
-    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
-    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
 
 
 @dataclass
@@ -76,25 +74,23 @@ class BenchmarkMetrics:
     n_failed_queries: int = 0
     total_first_chunk_latency: float = 0.0
     total_latency: float = 0.0
-    n_total_chunks: int = 0
     n_total_prompt_tokens: int = 0
     n_total_completion_tokens: int = 0
-    total_chunks_time: float = 0.0
     start_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
     n_time_per_output_token: float = 0.0
+    n_total_inter_token_latency: List[float] = field(default_factory=list)
 
     avg_first_chunk_latency: float = -1
     avg_latency: float = -1
-    n_avg_chunks: float = -1
-    avg_chunk_time: float = -1
     avg_prompt_tokens: float = -1
     avg_completion_tokens: float = -1
     avg_input_token_per_seconds: float = -1
     avg_output_token_per_seconds: float = -1
     avg_total_token_per_seconds: float = -1
     avg_time_per_token: float = -1
+    avg_inter_token_latency: float = -1
     qps: float = -1
 
     def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
@@ -113,9 +109,8 @@ class BenchmarkMetrics:
             benchmark_data._calculate_query_stream_metric()
             self.total_latency += benchmark_data.query_latency
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
-            self.n_total_chunks += benchmark_data.n_chunks
-            self.total_chunks_time += benchmark_data.n_chunks_time
             self.n_time_per_output_token += benchmark_data.time_per_output_token
+            self.n_total_inter_token_latency += benchmark_data.inter_chunk_latency
         else:
            self.n_failed_queries += 1
 
@@ -127,8 +122,6 @@
         try:
             self.avg_first_chunk_latency = self.total_first_chunk_latency / self.n_succeed_queries
             self.avg_latency = self.total_latency / self.n_succeed_queries
-            self.n_avg_chunks = self.n_total_chunks / self.n_succeed_queries
-            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
             self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
@@ -136,6 +129,8 @@
             self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
                                                 + self.n_total_completion_tokens) / self.total_time
             self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
+            self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
+                self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
@@ -154,9 +149,8 @@
             Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
             Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
             Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INTER_TOKEN_LATENCY: round(self.avg_inter_token_latency, default_ndigits),
             Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
             Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
-            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
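The reworked `_calculate_query_stream_metric` drops the old chunk-count bookkeeping (`n_chunks`, `n_chunks_time`) in favor of a per-request time-per-output-token and a list of inter-chunk latencies, which later feed the new `Average inter-token latency (s)` metric. A numeric walk-through of the new formulas with made-up timings:

```python
# Made-up timings for one streamed request, to walk through the new formulas.
start_time = 0.0
chunk_times = [0.40, 0.55, 0.70, 0.85, 1.00]   # arrival times of the streamed chunks
completed_time = 1.00
completion_tokens = 5

query_latency = completed_time - start_time                  # 1.00 s total latency
first_chunk_latency = chunk_times[0] - start_time            # 0.40 s time to first token
time_per_output_token = ((query_latency - first_chunk_latency) / (completion_tokens - 1)
                         if completion_tokens > 1 else 0.0)  # (1.00 - 0.40) / 4 = 0.15 s
inter_chunk_latency = [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]

print(round(time_per_output_token, 3))                 # 0.15
print([round(gap, 3) for gap in inter_chunk_latency])  # [0.15, 0.15, 0.15, 0.15]
```

The summary-level `avg_inter_token_latency` is then simply the mean of all inter-chunk gaps collected across successful requests.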
evalscope/perf/utils/db_util.py CHANGED
@@ -16,6 +16,28 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class DatabaseColumns:
+    REQUEST = 'request'
+    START_TIME = 'start_time'
+    CHUNK_TIMES = 'chunk_times'
+    SUCCESS = 'success'
+    RESPONSE_MESSAGES = 'response_messages'
+    COMPLETED_TIME = 'completed_time'
+    LATENCY = 'latency'
+    FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+    PROMPT_TOKENS = 'prompt_tokens'
+    COMPLETION_TOKENS = 'completion_tokens'
+    MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+    TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+def load_prompt(prompt_path_or_text):
+    if prompt_path_or_text.startswith('@'):
+        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+            return file.read()
+    return prompt_path_or_text
+
+
 def encode_data(data) -> str:
     """Encodes data using base64 and pickle."""
     return base64.b64encode(pickle.dumps(data)).decode('utf-8')
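The new `load_prompt` helper treats a leading `@` as "read the prompt from this file" and returns any other string unchanged. A short usage sketch, assuming this hunk belongs to `evalscope/perf/utils/db_util.py` (as the surrounding helpers suggest) and using a hypothetical file path:

```python
from evalscope.perf.utils.db_util import load_prompt  # assumed module path

inline_prompt = load_prompt('Summarize the following text.')  # returned verbatim
file_prompt = load_prompt('@prompts/system_prompt.txt')       # hypothetical path; the file's contents are returned
```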
@@ -34,20 +56,20 @@ def transpose_results(data):
 
 
 def create_result_table(cursor):
-    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
-                   request TEXT,
-                   start_time REAL,
-                   chunk_times TEXT,
-                   success INTEGER,
-                   response_messages TEXT,
-                   completed_time REAL,
-                   latency REAL,
-                   first_chunk_latency REAL,
-                   n_chunks INTEGER,
-                   chunk_time REAL,
-                   prompt_tokens INTEGER,
-                   completion_tokens INTEGER,
-                   max_gpu_memory_cost REAL)''')
+    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+                   {DatabaseColumns.REQUEST} TEXT,
+                   {DatabaseColumns.START_TIME} REAL,
+                   {DatabaseColumns.CHUNK_TIMES} TEXT,
+                   {DatabaseColumns.SUCCESS} INTEGER,
+                   {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+                   {DatabaseColumns.COMPLETED_TIME} REAL,
+                   {DatabaseColumns.LATENCY} REAL,
+                   {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+                   {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+                   {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+                   {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+                   {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+                   )''')
 
 
 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -67,24 +89,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 
     if benchmark_data.success:
         # Add additional columns for success case
-        additional_columns = (
-            benchmark_data.query_latency,
-            benchmark_data.first_chunk_latency,
-            benchmark_data.n_chunks,
-            benchmark_data.n_chunks_time,
-            benchmark_data.prompt_tokens,
-            benchmark_data.completion_tokens,
-            benchmark_data.max_gpu_memory_cost,
-        )
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages,
-            completed_time, latency, first_chunk_latency,
-            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
+                              benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
+                              benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+            {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+            {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+            {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns + additional_columns)
     else:
-        query = """INSERT INTO result(
-            request, start_time, chunk_times, success, response_messages, completed_time
+        query = f"""INSERT INTO result(
+            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
             ) VALUES (?, ?, ?, ?, ?, ?)"""
         cursor.execute(query, common_columns)
 
@@ -160,44 +179,43 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
             logger.error(f'Error parsing chunk times: {e}')
             return []
 
-    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1')
+    query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+        {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+        {DatabaseColumns.PROMPT_TOKENS},
+        {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+        FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
 
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]
 
     with sqlite3.connect(result_db_path) as con:
-        rows = con.execute(query_sql).fetchall()
+        cursor = con.cursor()
+        cursor.execute(query_sql)
+        columns = [description[0] for description in cursor.description]
+        rows = cursor.fetchall()
 
-        # Define index variables for columns
-        CHUNK_TIMES_INDEX = 1
-        LATENCY_INDEX = 4
-        FIRST_CHUNK_LATENCY_INDEX = 5
-        CHUNK_TIME_INDEX = 7
-        PROMPT_TOKENS_INDEX = 8
-        COMPLETION_TOKENS_INDEX = 9
+        # Create column index mapping
+        col_indices = {col: idx for idx, col in enumerate(columns)}
 
         # Prepare data for each metric
         inter_token_latencies_all = []
         for row in rows:
-            inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+            inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
 
         metrics = {
-            PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+            PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
             PercentileMetrics.ITL:
             inter_token_latencies_all,
-            PercentileMetrics.TPOT:
-            [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
-             for row in rows],
-            PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
-            PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
-            PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+            PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+            PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+            PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+            PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
             PercentileMetrics.OUTPUT_THROUGHPUT:
-            [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-             for row in rows],
-            PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
-                                                  / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                                                 for row in rows]
+            [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+            PercentileMetrics.TOTAL_THROUGHPUT:
+            [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+              / row[col_indices[DatabaseColumns.LATENCY]])
+             if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
         }
 
         # Calculate percentiles for each metric
@@ -237,18 +255,18 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
 
 
 def speed_benchmark_result(result_db_path: str):
-    query_sql = """
+    query_sql = f"""
         SELECT
-            prompt_tokens,
-            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
-            ROUND(AVG(max_gpu_memory_cost), 2)
+            {DatabaseColumns.PROMPT_TOKENS},
+            ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+            ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
         FROM
             result
         WHERE
-            success = 1 AND latency > 0
+            {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
         GROUP BY
-            prompt_tokens
-    """
+            {DatabaseColumns.PROMPT_TOKENS}
+    """  # noqa: E501
 
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
evalscope/report/__init__.py CHANGED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+    from .combinator import gen_table, get_data_frame, get_report_list
     from .generator import ReportGenerator
     from .utils import Category, Report, ReportKey, Subset
 
evalscope/report/utils.py CHANGED
@@ -3,14 +3,45 @@ import os
 import pandas as pd
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
 from evalscope.metrics import macro_mean, micro_mean
-from evalscope.utils import normalize_score
-from evalscope.utils.logger import get_logger
+from evalscope.utils import get_logger
 logger = get_logger()
 
 
+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+3. 只列出报告本身,不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
+def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+    """
+    Normalize score.
+
+    Args:
+        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
+        keep_num: number of digits to keep.
+
+    Returns:
+        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
+    """
+    if isinstance(score, float):
+        score = round(score, keep_num)
+    elif isinstance(score, dict):
+        score = {k: round(v, keep_num) for k, v in score.items()}
+    else:
+        logger.warning(f'Unknown score type: {type(score)}')
+
+    return score
+
 
 @dataclass
 class Subset:
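`normalize_score`, previously imported from `evalscope.utils`, is now defined next to the report utilities; it simply rounds a float, or every value of a dict, to `keep_num` digits. A quick behavioral check:

```python
from evalscope.report.utils import normalize_score

print(normalize_score(0.98765432))                      # 0.9877
print(normalize_score({'acc': 0.66666666, 'f1': 0.5}))  # {'acc': 0.6667, 'f1': 0.5}
print(normalize_score(0.98765432, keep_num=2))          # 0.99
```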
@@ -74,18 +105,6 @@ class ReportKey:
     score = 'Score'
 
 
-ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
-1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
-2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
-3. 只列出报告本身,不要有其他多余内容
-4. 输出报告语言为{language}
-
-```json
-{report_str}
-```
-"""
-
-
 @dataclass
 class Report:
     name: str = 'default_report'