evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (57)
  1. evalscope/backend/rag_eval/__init__.py +1 -1
  2. evalscope/backend/rag_eval/backend_manager.py +21 -5
  3. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  4. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  5. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  6. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  7. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  8. evalscope/backend/rag_eval/utils/llm.py +4 -4
  9. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  10. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  11. evalscope/benchmarks/data_adapter.py +6 -2
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  13. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  14. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  15. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  16. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  17. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  18. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  19. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  20. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  21. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  22. evalscope/collections/evaluator.py +4 -2
  23. evalscope/config.py +1 -1
  24. evalscope/perf/arguments.py +24 -5
  25. evalscope/perf/benchmark.py +28 -42
  26. evalscope/perf/http_client.py +2 -3
  27. evalscope/perf/plugin/api/custom_api.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +2 -2
  29. evalscope/perf/plugin/datasets/custom.py +4 -1
  30. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  31. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  32. evalscope/perf/plugin/datasets/openqa.py +4 -1
  33. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  34. evalscope/perf/utils/benchmark_util.py +12 -6
  35. evalscope/perf/utils/db_util.py +1 -1
  36. evalscope/perf/utils/log_utils.py +41 -0
  37. evalscope/report/app.py +11 -11
  38. evalscope/run.py +7 -0
  39. evalscope/summarizer.py +2 -1
  40. evalscope/utils/utils.py +36 -25
  41. evalscope/version.py +2 -2
  42. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
  43. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
  44. tests/cli/test_all.py +4 -4
  45. tests/cli/test_collection.py +2 -1
  46. tests/cli/test_run.py +9 -8
  47. tests/perf/test_perf.py +1 -2
  48. tests/rag/test_clip_benchmark.py +0 -1
  49. tests/rag/test_mteb.py +37 -8
  50. tests/rag/test_ragas.py +29 -26
  51. tests/vlm/test_vlmeval.py +37 -1
  52. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  53. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  54. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  55. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  56. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  57. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py CHANGED
@@ -44,7 +44,7 @@ def scan_for_report_folders(root_path):
             continue
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-            datasets.append(os.path.basename(dataset_item).split('.')[0])
+            datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
         datasets = DATASET_TOKEN.join(datasets)
         reports.append(
             f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -253,17 +253,17 @@ def process_model_prediction(item: Any):
 
 
     def normalize_score(score):
-        if isinstance(score, bool):
-            return 1.0 if score else 0.0
-        elif isinstance(score, dict):
-            for key in score:
-                return float(score[key])
-            return 0.0
-        else:
-            try:
-                return float(score)
-            except (ValueError, TypeError):
+        try:
+            if isinstance(score, bool):
+                return 1.0 if score else 0.0
+            elif isinstance(score, dict):
+                for key in score:
+                    return float(score[key])
                 return 0.0
+            else:
+                return float(score)
+        except (ValueError, TypeError):
+            return 0.0
 
 
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
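Note on the normalize_score change above: the dict branch now sits inside the try/except, so a dict whose first value cannot be cast to float degrades to 0.0 instead of raising. A standalone sketch of the new logic on hypothetical score values:

def normalize_score(score):
    try:
        if isinstance(score, bool):  # check bool first: bool is a subclass of int
            return 1.0 if score else 0.0
        elif isinstance(score, dict):
            for key in score:        # only the first value is used
                return float(score[key])
            return 0.0               # empty dict
        else:
            return float(score)
    except (ValueError, TypeError):
        return 0.0

print(normalize_score(True))             # 1.0
print(normalize_score({'acc': '0.75'}))  # 0.75
print(normalize_score({'acc': 'n/a'}))   # 0.0 (would have raised ValueError in 0.13.2)
print(normalize_score(None))             # 0.0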
evalscope/run.py CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs
 
 
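The new RAG_EVAL branch above mirrors the existing backends: MTEB writes to eval['output_folder'] while the CLIP benchmark writes to eval['output_dir']. A minimal sketch of the dispatch, assuming Tools.MTEB and Tools.CLIP_BENCHMARK are lowercase string constants (an assumption; only the .lower() comparison is visible in the diff):

class Tools:  # hypothetical stand-in for evalscope.backend.rag_eval.Tools
    MTEB = 'mteb'                      # assumed value
    CLIP_BENCHMARK = 'clip_benchmark'  # assumed value

def route_output_dir(eval_config: dict, work_dir: str) -> dict:
    tool = eval_config['tool'].lower()
    if tool == Tools.MTEB:
        eval_config['eval']['output_folder'] = work_dir
    elif tool == Tools.CLIP_BENCHMARK:
        eval_config['eval']['output_dir'] = work_dir
    return eval_config

print(route_output_dir({'tool': 'MTEB', 'eval': {}}, 'outputs/run1'))
# {'tool': 'MTEB', 'eval': {'output_folder': 'outputs/run1'}}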
evalscope/summarizer.py CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                 summary_res: dict = csv_to_list(summary_file_path)[0]
             elif summary_file_path.endswith('json'):
                 summary_res: dict = json_to_dict(summary_file_path)
-            file_name = os.path.basename(summary_file_path).split('.')[0]
+            base_name = os.path.basename(summary_file_path)
+            file_name = os.path.splitext(base_name)[0]
             final_res_list.append({file_name: summary_res})
 
         elif eval_backend == EvalBackend.THIRD_PARTY:
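Both this change and the one in report/app.py replace .split('.')[0] with os.path.splitext, which only strips the final extension, so base names containing dots are preserved. A quick demonstration on a hypothetical file name:

import os

name = 'general_qa.v2.json'  # hypothetical name with a dot in the stem
print(name.split('.')[0])         # 'general_qa'    -- drops '.v2'
print(os.path.splitext(name)[0])  # 'general_qa.v2' -- keeps it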
evalscope/utils/utils.py CHANGED
@@ -90,7 +90,7 @@ class ResponseParser:
         return ''
 
     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
@@ -98,7 +98,7 @@
             text: The text to parse.
             options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat = '|'.join([str(i) for i in options])
+        options_concat = ResponseParser.process_options(options)
 
         patterns = [
             rf'答案是?\s?([{options_concat}])',
@@ -155,48 +155,53 @@
         for i in options:
             if i in outputs:
                 return i
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-            r'answer is \(?(\w+)\)?',
-            r'[Aa]nswer:\s*(\w+)',
-            r'[Tt]he correct answer is:\s*(\w+)',
-            r'[Tt]he correct answer is:\n\s*(\w+)',
-            r'[Tt]he correct answer is:\n\n-\s*(\w+)',
-            r'[Tt]he answer might be:\n\n-\s*(\w+)',
-            r'[Tt]he answer is \s*(\w+)',
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-            match = regex.search(text)
-            if match:
-                return match.group(1)
-        return ''
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        return 'No valid option found'
+
 
     @staticmethod
-    def parse_first_capital_multi(text: str) -> str:
-        match = re.search(r'([A-D]+)', text)
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_last_option(text: str, options: str) -> str:
-        match = re.findall(rf'([{options}])', text)
-        if match:
-            return match[-1]
-        return ''
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern
 
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
@@ -299,3 +304,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
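The refactor above anchors every pattern on the escaped option set instead of the greedy \w+, so only genuine options can match, and options containing regex metacharacters stay literal. A short sketch of the helper on hypothetical inputs:

import re

def process_options(options):
    # re.escape keeps metacharacters such as '+' or '(' literal
    return '|'.join(re.escape(option) for option in options)

pattern = process_options(['A', 'B', 'C+', '(D)'])
print(pattern)  # A|B|C\+|\(D\)

text = 'Reasoning... ANSWER: C+'
match = re.search(rf'ANSWER:\s*({pattern})', text)
print(match.group(1) if match else 'No valid option found')  # C+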
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.13.2'
-__release_datetime__ = '2025-04-01 20:00:00'
+__version__ = '0.14.0'
+__release_datetime__ = '2025-04-10 20:00:00'
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.2
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -71,12 +71,12 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
-Requires-Dist: langchain<0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.3.0; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.9; extra == "all"
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -99,12 +99,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
-Requires-Dist: langchain<0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.9; extra == "rag"
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -121,7 +121,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>
 
 <p align="center">
-    <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+    <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
     <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
     <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -199,6 +199,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -212,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
- <details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -503,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
 
+**Supports swanlab for recording results**
+
+![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
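One of the 0.14.0 news items above is /v1/completions support in the stress-testing tool, vLLM's default benchmarking endpoint. For orientation, the request shape differs from chat completions roughly as follows (a generic OpenAI-API sketch with a placeholder model name, not evalscope internals):

# /v1/chat/completions takes a structured message list:
chat_payload = {
    'model': 'qwen2.5',  # placeholder
    'messages': [{'role': 'user', 'content': 'Hello'}],
    'max_tokens': 128,
}

# /v1/completions takes a raw prompt string:
completion_payload = {
    'model': 'qwen2.5',  # placeholder
    'prompt': 'Hello',
    'max_tokens': 128,
}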
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
 evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
-evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
+evalscope/config.py,sha256=sc8NoqhspbrNYMS201ZWreCKV-tBJrUEt96vKwpqfDY,9483
 evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
-evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
+evalscope/run.py,sha256=XbUhllYPjaJJuR1hPoGZH0jlW8XlvUv9gONrMBc4Ni0,6450
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
-evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
+evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
+evalscope/version.py,sha256=4w52xL5au75pTD-PrvG-9l-U1euGk2032efyc-7IkQw,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,8 +14,8 @@ evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
-evalscope/backend/rag_eval/__init__.py,sha256=jFWj8l8bPAu1sz7wtX5gGIweBFC8c2LzXUPz7tGambE,284
-evalscope/backend/rag_eval/backend_manager.py,sha256=Cw322R1j-L8vMERAWEXUTT-0a1K-V6KhQOtrOhgKVMM,2857
+evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
+evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
 evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
 evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
@@ -27,7 +27,7 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
 evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
 evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
 evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
-evalscope/backend/rag_eval/cmteb/arguments.py,sha256=Z3GkGi7zjK85JynG-7CSVPmAxPRcGYuykkgfbxgn7_E,2317
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=y2iTbs3a7R747NgS00nK2j3zO7gmREh8n7mWMrzF1js,2653
 evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
 evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
 evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
@@ -39,25 +39,24 @@ evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9
 evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
 evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
 evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
-evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=S6M1nsqwMQ8lnZZDtlQTdzyOCfLn9WP0QJ_7wAEsVgc,1695
 evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
 evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
 evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
 evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
-evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8mkVfq3i_oJg1MSnPm98E7WdOBdyUwMpA,5784
+evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
 evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
-evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
-evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
+evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
+evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
 evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
 evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
-evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
+evalscope/benchmarks/data_adapter.py,sha256=lwW23GjHHAptv4mc1u3xLlKqiRI1EfbSqaG3QGmxqEQ,17750
 evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
@@ -66,7 +65,7 @@ evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
-evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
+evalscope/benchmarks/arc/arc_adapter.py,sha256=U-yPDAjYkPUUOXYjCM1ajdvlUVcdeuVoMK7yWJcX6LI,6369
 evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
 evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
@@ -116,7 +115,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -125,7 +124,7 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
 evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
 evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
-evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
+evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=SRM_-AKlWtKXi4zrlBAH9YceFnrktZDNsjvQOiPizUM,5893
 evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
@@ -139,25 +138,26 @@ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
 evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
-evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
 evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
-evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
+evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=vLr43hvtR0WS9GclJ6xL9MIqwC941EiRSqgZ_hGHZnw,3382
 evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
 evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
 evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
-evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
+evalscope/benchmarks/live_code_bench/testing_util.py,sha256=v4N7Y4MasNL6TjC4w-Duw_4Zn0oLdWAw3HU6ZrM76P8,17161
+evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=RVbsiglxmEW37-tDYgr4Drywh26I94DRGhwv7uP2aYk,2829
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
-evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
+evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=bQSRTgXk01pCfKdmTxr3si4FxET3j_yBVVmQlLchTns,11586
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
+evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=hPqxDqDhqin3TxfimfhIxfEc_8UfzTDGAfX7iDrWy28,4248
 evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
+evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
 evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
+evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
 evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
 evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
@@ -187,7 +187,7 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
+evalscope/collections/evaluator.py,sha256=4IkdbKySOW-MzH9Zjn0uddQviFLe2pOef746fgbjkJo,12784
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
@@ -219,32 +219,33 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
-evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
-evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
+evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
+evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
 evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
-evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
+evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
+evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
 evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
-evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
+evalscope/perf/plugin/datasets/custom.py,sha256=npreC7H1VsdTGYkqlMESvyOhtXOfZQA7_-ICmxe3FWk,936
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
-evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
-evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
-evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
-evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
+evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
+evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
+evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
-evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
-evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
+evalscope/perf/utils/benchmark_util.py,sha256=XrpB6ISjY2p1ngwPr5eOQS7O_I1kmlbEn2wCwsC_5AA,6278
+evalscope/perf/utils/db_util.py,sha256=VDqiM6xOK7fSneU3YOOU-78LWB8El3mxj_Ixtw2gX3o,9051
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
+evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
 evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -267,7 +268,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
 evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
 evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
-evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
+evalscope/report/app.py,sha256=Lew--YreNeuyLVktnUNZKIfGvnGE_oAD054kZB-YTHo,26904
 evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
 evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
 evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -314,28 +315,28 @@ evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,15
 evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
 evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
 evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
-evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
+evalscope/utils/utils.py,sha256=VuGdJh3xZAZ-cRoGcKeJTx3z8sgSs2eMjH-1JX2ZYOU,10615
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
-tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
-tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
+tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
+tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
+tests/cli/test_run.py,sha256=RW4AkJILqzzyd0wuIdy8Y9SB_4koSRJFezGjFdXdLJI,16549
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
+tests/perf/test_perf.py,sha256=BXd6SCMbBDKmh-P_KGTOpuwVQZ05xCKjvH01zGyvBJI,3787
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
-tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
-tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
+tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
+tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
+tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
 tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
 tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
-evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
-evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
-evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
-evalscope-0.13.2.dist-info/RECORD,,
+tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
+evalscope-0.14.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.14.0.dist-info/METADATA,sha256=HQ1pt-YU950AcwwWiypjGcWg0wYU9n6PFZ7j6PG4uHg,33040
+evalscope-0.14.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.14.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.14.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.14.0.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -4,13 +4,12 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 
 import os
-import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
 from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed, test_level_list
+from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
 os.environ['LOG_LEVEL'] = 'DEBUG'
@@ -46,6 +45,7 @@ datasets=[
     'chinese_simpleqa',
     'alpaca_eval',
     'arena_hard',
+    'maritime_bench',
 ]
 
 dataset_args={
@@ -134,8 +134,8 @@ class TestRun(unittest.TestCase):
             eval_type=EvalType.SERVICE,
             datasets=datasets,
             dataset_args=dataset_args,
-            eval_batch_size=2,
-            limit=2,
+            eval_batch_size=1,
+            limit=1,
             stream=True,
             generation_config={
                 'temperature': 0,
tests/cli/test_collection.py CHANGED
@@ -80,4 +80,5 @@ class TestCollection(unittest.TestCase):
                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
             }
         )
-        run_task(task_cfg=task_cfg)
+        res = run_task(task_cfg=task_cfg)
+        print(res)
tests/cli/test_run.py CHANGED
@@ -137,7 +137,7 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['gsm8k'],
             },
             'musr': {
-                'subset_list': ['murder_mysteries']
+                'subset_list': ['murder_mysteries'],
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -263,7 +263,7 @@ class TestRun(unittest.TestCase):
         datasets=[
             # 'iquiz',
             # 'ifeval',
-            # 'mmlu',
+            'mmlu',
             # 'mmlu_pro',
             # 'musr',
             # 'process_bench',
@@ -281,9 +281,10 @@ class TestRun(unittest.TestCase):
             # 'ceval',
             # 'hellaswag',
             # 'general_mcq',
-            'general_qa'
+            # 'general_qa'
             # 'super_gpqa',
-            # 'mmlu_redux'
+            # 'mmlu_redux',
+            'maritime_bench'
         ],
         dataset_args={
             'mmlu': {
@@ -322,7 +323,8 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['gsm8k'],
             },
             'musr': {
-                'subset_list': ['murder_mysteries']
+                'subset_list': ['murder_mysteries'],
+                'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -353,10 +355,9 @@ class TestRun(unittest.TestCase):
         stream=False,
         generation_config={
             'temperature': 0,
-            'n': 2,
+            'n': 1,
             'max_tokens': 4096,
-        },
-        use_cache='outputs/20250326_202848',
+        }
     )
 
     run_task(task_cfg=task_cfg)
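The new maritime_bench adapter is exercised by both test files above. Based on those tests, a minimal native run would look like this sketch (the model id and limit are placeholders; only datasets=['maritime_bench'] comes from the diff):

from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['maritime_bench'],
    limit=1,
)
run_task(task_cfg=task_cfg)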