evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of evalscope might be problematic.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/db_util.py

@@ -6,6 +6,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -107,44 +108,87 @@ def get_result_db_path(args: Arguments):
     return result_db_path


-def get_percentile_results(result_db_path: str):
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.

-    def percentile_results(rows, index, percentiles):
-        results = {}
-        n_success_queries = len(rows)
-        for percentile in percentiles:
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-            row = rows[idx]
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-        return results
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows) <= len(percentiles):
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}

-    # Calculate percentiles for first chunk latency and latency
-    first_chunk_latency_index = 5
-    latency_index = 4
+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)':
+        inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)':
+        [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+         for row in rows]
+    }

-    first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
-    rows.sort(key=lambda x: x[latency_index])
-    latency_results = percentile_results(rows, latency_index, percentiles)
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]

-    # Prepare data for tabulation
-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
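
For context on the change above: the new code computes nearest-rank percentiles per metric and derives TPOT (time per output token) from the gaps between consecutive chunk arrival times stored in the chunk_times column. A minimal standalone sketch of that scheme (the sample chunk times are invented for illustration; this is not evalscope's public API):

import json

def nearest_rank_percentiles(data, percentiles):
    # Sort a copy and pick the value at index int(n * p / 100), as in the diff above.
    data = sorted(data)
    n = len(data)
    results = {}
    for p in percentiles:
        try:
            results[p] = round(data[int(n * p / 100)], 4)
        except IndexError:
            results[p] = float('nan')
    return results

# chunk_times is stored as a JSON list of chunk arrival timestamps (seconds);
# the inter-token latency (TPOT) is the difference between consecutive chunks.
chunk_times = json.loads('[0.31, 0.35, 0.40, 0.44, 0.49]')
tpots = [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
print(nearest_rank_percentiles(tpots, [50, 90, 99]))  # {50: 0.05, 90: 0.05, 99: 0.05}
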
evalscope/perf/utils/local_server.py

@@ -102,6 +102,8 @@ def start_app(args: Arguments):

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',
@@ -111,7 +113,8 @@ def start_app(args: Arguments):
             '--max-model-len', '32768',
             '--gpu-memory-utilization', '0.9',
             '--host', '0.0.0.0',
-            '--port', args.port,
+            '--port', str(args.port),
+            '--trust-remote-code',
             '--disable-log-requests',
             '--disable-log-stats',
         ])
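
Two notes on this hunk: subprocess.Popen requires every element of the argument list to be a string, so str(args.port) avoids a TypeError when the port is supplied as an integer, and --trust-remote-code lets vLLM load models whose repositories ship custom modeling code. Before sending perf traffic to the spawned server, it can help to poll its OpenAI-compatible endpoint until it responds; a rough sketch (the host, port, and /v1/models path are assumptions based on the launch flags above, not part of this diff):

import time
import urllib.error
import urllib.request

def wait_for_server(port, timeout=120.0):
    # Poll the OpenAI-compatible model listing until the server answers or we time out.
    url = f'http://127.0.0.1:{port}/v1/models'
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            time.sleep(2)  # not up yet; retry
    return False
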
evalscope/report/__init__.py (new file)

@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
+from evalscope.report.generator import ReportGenerator
+from evalscope.report.utils import Category, Report, ReportKey, Subset
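
The new evalscope.report package appears to supersede the removed evalscope/tools/combine_reports.py entry points. A quick smoke test of the import surface added here (only the names visible in this diff are referenced; their call signatures are not shown, so nothing is invoked):

# Confirms the 0.10.1 report entry points resolve; requires evalscope >= 0.10.1.
from evalscope.report import (Category, Report, ReportGenerator, ReportKey, Subset,
                              gen_table, get_data_frame, get_report_list)

print(ReportGenerator, gen_table, get_report_list)
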