evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (57)
  1. evalscope/backend/rag_eval/__init__.py +1 -1
  2. evalscope/backend/rag_eval/backend_manager.py +21 -5
  3. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  4. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  5. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  6. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  7. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  8. evalscope/backend/rag_eval/utils/llm.py +4 -4
  9. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  10. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  11. evalscope/benchmarks/data_adapter.py +6 -2
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  13. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  14. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  15. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  16. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  17. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  18. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  19. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  20. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  21. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  22. evalscope/collections/evaluator.py +4 -2
  23. evalscope/config.py +1 -1
  24. evalscope/perf/arguments.py +24 -5
  25. evalscope/perf/benchmark.py +28 -42
  26. evalscope/perf/http_client.py +2 -3
  27. evalscope/perf/plugin/api/custom_api.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +2 -2
  29. evalscope/perf/plugin/datasets/custom.py +4 -1
  30. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  31. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  32. evalscope/perf/plugin/datasets/openqa.py +4 -1
  33. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  34. evalscope/perf/utils/benchmark_util.py +12 -6
  35. evalscope/perf/utils/db_util.py +1 -1
  36. evalscope/perf/utils/log_utils.py +41 -0
  37. evalscope/report/app.py +11 -11
  38. evalscope/run.py +7 -0
  39. evalscope/summarizer.py +2 -1
  40. evalscope/utils/utils.py +36 -25
  41. evalscope/version.py +2 -2
  42. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
  43. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
  44. tests/cli/test_all.py +4 -4
  45. tests/cli/test_collection.py +2 -1
  46. tests/cli/test_run.py +9 -8
  47. tests/perf/test_perf.py +1 -2
  48. tests/rag/test_clip_benchmark.py +0 -1
  49. tests/rag/test_mteb.py +37 -8
  50. tests/rag/test_ragas.py +29 -26
  51. tests/vlm/test_vlmeval.py +37 -1
  52. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  53. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  54. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  55. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  56. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  57. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py CHANGED
@@ -44,7 +44,7 @@ def scan_for_report_folders(root_path):
             continue
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-            datasets.append(os.path.basename(dataset_item).split('.')[0])
+            datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
         datasets = DATASET_TOKEN.join(datasets)
         reports.append(
             f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -253,17 +253,17 @@ def process_model_prediction(item: Any):
 
 
     def normalize_score(score):
-        if isinstance(score, bool):
-            return 1.0 if score else 0.0
-        elif isinstance(score, dict):
-            for key in score:
-                return float(score[key])
-            return 0.0
-        else:
-            try:
-                return float(score)
-            except (ValueError, TypeError):
+        try:
+            if isinstance(score, bool):
+                return 1.0 if score else 0.0
+            elif isinstance(score, dict):
+                for key in score:
+                    return float(score[key])
                 return 0.0
+            else:
+                return float(score)
+        except (ValueError, TypeError):
+            return 0.0
 
 
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
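Note on the normalize_score change above: the dict branch now sits inside the try/except, so a dict whose first value cannot be cast to float degrades to 0.0 instead of raising. A standalone sketch of the new logic on hypothetical score values:

def normalize_score(score):
    try:
        if isinstance(score, bool):  # check bool first: bool is a subclass of int
            return 1.0 if score else 0.0
        elif isinstance(score, dict):
            for key in score:        # only the first value is used
                return float(score[key])
            return 0.0               # empty dict
        else:
            return float(score)
    except (ValueError, TypeError):
        return 0.0

print(normalize_score(True))             # 1.0
print(normalize_score({'acc': '0.75'}))  # 0.75
print(normalize_score({'acc': 'n/a'}))   # 0.0 (would have raised ValueError in 0.13.2)
print(normalize_score(None))             # 0.0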
evalscope/run.py CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs
 
 
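The new RAG_EVAL branch above mirrors the existing backends: MTEB writes to eval['output_folder'] while the CLIP benchmark writes to eval['output_dir']. A minimal sketch of the dispatch, assuming Tools.MTEB and Tools.CLIP_BENCHMARK are lowercase string constants (an assumption; only the .lower() comparison is visible in the diff):

class Tools:  # hypothetical stand-in for evalscope.backend.rag_eval.Tools
    MTEB = 'mteb'                      # assumed value
    CLIP_BENCHMARK = 'clip_benchmark'  # assumed value

def route_output_dir(eval_config: dict, work_dir: str) -> dict:
    tool = eval_config['tool'].lower()
    if tool == Tools.MTEB:
        eval_config['eval']['output_folder'] = work_dir
    elif tool == Tools.CLIP_BENCHMARK:
        eval_config['eval']['output_dir'] = work_dir
    return eval_config

print(route_output_dir({'tool': 'MTEB', 'eval': {}}, 'outputs/run1'))
# {'tool': 'MTEB', 'eval': {'output_folder': 'outputs/run1'}}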
evalscope/summarizer.py CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                 summary_res: dict = csv_to_list(summary_file_path)[0]
             elif summary_file_path.endswith('json'):
                 summary_res: dict = json_to_dict(summary_file_path)
-            file_name = os.path.basename(summary_file_path).split('.')[0]
+            base_name = os.path.basename(summary_file_path)
+            file_name = os.path.splitext(base_name)[0]
             final_res_list.append({file_name: summary_res})
 
         elif eval_backend == EvalBackend.THIRD_PARTY:
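Both this change and the one in report/app.py replace .split('.')[0] with os.path.splitext, which only strips the final extension, so base names containing dots are preserved. A quick demonstration on a hypothetical file name:

import os

name = 'general_qa.v2.json'  # hypothetical name with a dot in the stem
print(name.split('.')[0])         # 'general_qa'    -- drops '.v2'
print(os.path.splitext(name)[0])  # 'general_qa.v2' -- keeps it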
evalscope/utils/utils.py CHANGED
@@ -90,7 +90,7 @@ class ResponseParser:
         return ''
 
     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
@@ -98,7 +98,7 @@
             text: The text to parse.
             options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat = '|'.join([str(i) for i in options])
+        options_concat = ResponseParser.process_options(options)
 
         patterns = [
             rf'答案是?\s?([{options_concat}])',
@@ -155,48 +155,53 @@
         for i in options:
             if i in outputs:
                 return i
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-            r'answer is \(?(\w+)\)?',
-            r'[Aa]nswer:\s*(\w+)',
-            r'[Tt]he correct answer is:\s*(\w+)',
-            r'[Tt]he correct answer is:\n\s*(\w+)',
-            r'[Tt]he correct answer is:\n\n-\s*(\w+)',
-            r'[Tt]he answer might be:\n\n-\s*(\w+)',
-            r'[Tt]he answer is \s*(\w+)',
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-            match = regex.search(text)
-            if match:
-                return match.group(1)
-        return ''
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        return 'No valid option found'
+
 
     @staticmethod
-    def parse_first_capital_multi(text: str) -> str:
-        match = re.search(r'([A-D]+)', text)
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_last_option(text: str, options: str) -> str:
-        match = re.findall(rf'([{options}])', text)
-        if match:
-            return match[-1]
-        return ''
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern
 
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
@@ -299,3 +304,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
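The refactor above anchors every pattern on the escaped option set instead of the greedy \w+, so only genuine options can match, and options containing regex metacharacters stay literal. A short sketch of the helper on hypothetical inputs:

import re

def process_options(options):
    # re.escape keeps metacharacters such as '+' or '(' literal
    return '|'.join(re.escape(option) for option in options)

pattern = process_options(['A', 'B', 'C+', '(D)'])
print(pattern)  # A|B|C\+|\(D\)

text = 'Reasoning... ANSWER: C+'
match = re.search(rf'ANSWER:\s*({pattern})', text)
print(match.group(1) if match else 'No valid option found')  # C+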
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.13.2'
-__release_datetime__ = '2025-04-01 20:00:00'
+__version__ = '0.14.0'
+__release_datetime__ = '2025-04-10 20:00:00'
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.2
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -71,12 +71,12 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
-Requires-Dist: langchain<0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.3.0; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.9; extra == "all"
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -99,12 +99,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
-Requires-Dist: langchain<0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.9; extra == "rag"
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -121,7 +121,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>
 
 <p align="center">
-    <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+    <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
     <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
     <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -199,6 +199,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -212,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
- <details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -503,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
 
+**Supports swanlab for recording results**
+
+![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
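One of the 0.14.0 news items above is /v1/completions support in the stress-testing tool, vLLM's default benchmarking endpoint. For orientation, the request shape differs from chat completions roughly as follows (a generic OpenAI-API sketch with a placeholder model name, not evalscope internals):

# /v1/chat/completions takes a structured message list:
chat_payload = {
    'model': 'qwen2.5',  # placeholder
    'messages': [{'role': 'user', 'content': 'Hello'}],
    'max_tokens': 128,
}

# /v1/completions takes a raw prompt string:
completion_payload = {
    'model': 'qwen2.5',  # placeholder
    'prompt': 'Hello',
    'max_tokens': 128,
}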
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
 evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
-evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
+evalscope/config.py,sha256=sc8NoqhspbrNYMS201ZWreCKV-tBJrUEt96vKwpqfDY,9483
 evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
-evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
+evalscope/run.py,sha256=XbUhllYPjaJJuR1hPoGZH0jlW8XlvUv9gONrMBc4Ni0,6450
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
-evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
+evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
+evalscope/version.py,sha256=4w52xL5au75pTD-PrvG-9l-U1euGk2032efyc-7IkQw,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,8 +14,8 @@ evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
-evalscope/backend/rag_eval/__init__.py,sha256=jFWj8l8bPAu1sz7wtX5gGIweBFC8c2LzXUPz7tGambE,284
-evalscope/backend/rag_eval/backend_manager.py,sha256=Cw322R1j-L8vMERAWEXUTT-0a1K-V6KhQOtrOhgKVMM,2857
+evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
+evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
 evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
 evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
@@ -27,7 +27,7 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
 evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
 evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
 evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
-evalscope/backend/rag_eval/cmteb/arguments.py,sha256=Z3GkGi7zjK85JynG-7CSVPmAxPRcGYuykkgfbxgn7_E,2317
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=y2iTbs3a7R747NgS00nK2j3zO7gmREh8n7mWMrzF1js,2653
 evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
 evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
 evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
@@ -39,25 +39,24 @@ evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9
 evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
 evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
 evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
-evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=S6M1nsqwMQ8lnZZDtlQTdzyOCfLn9WP0QJ_7wAEsVgc,1695
 evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
 evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
 evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
 evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
-evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8mkVfq3i_oJg1MSnPm98E7WdOBdyUwMpA,5784
+evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
 evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
-evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
-evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
+evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
+evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
 evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
 evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
-evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
+evalscope/benchmarks/data_adapter.py,sha256=lwW23GjHHAptv4mc1u3xLlKqiRI1EfbSqaG3QGmxqEQ,17750
 evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
@@ -66,7 +65,7 @@ evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
-evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
+evalscope/benchmarks/arc/arc_adapter.py,sha256=U-yPDAjYkPUUOXYjCM1ajdvlUVcdeuVoMK7yWJcX6LI,6369
 evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
 evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
@@ -116,7 +115,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -125,7 +124,7 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
 evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
 evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
-evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
+evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=SRM_-AKlWtKXi4zrlBAH9YceFnrktZDNsjvQOiPizUM,5893
 evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
@@ -139,25 +138,26 @@ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
 evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
-evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
 evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
-evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
+evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=vLr43hvtR0WS9GclJ6xL9MIqwC941EiRSqgZ_hGHZnw,3382
 evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
 evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
 evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
-evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
+evalscope/benchmarks/live_code_bench/testing_util.py,sha256=v4N7Y4MasNL6TjC4w-Duw_4Zn0oLdWAw3HU6ZrM76P8,17161
+evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=RVbsiglxmEW37-tDYgr4Drywh26I94DRGhwv7uP2aYk,2829
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
-evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
+evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=bQSRTgXk01pCfKdmTxr3si4FxET3j_yBVVmQlLchTns,11586
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
+evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=hPqxDqDhqin3TxfimfhIxfEc_8UfzTDGAfX7iDrWy28,4248
 evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
+evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
 evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
+evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
 evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
 evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
@@ -187,7 +187,7 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
+evalscope/collections/evaluator.py,sha256=4IkdbKySOW-MzH9Zjn0uddQviFLe2pOef746fgbjkJo,12784
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
@@ -219,32 +219,33 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
-evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
-evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
+evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
+evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
 evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
-evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
+evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
+evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
 evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
-evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
+evalscope/perf/plugin/datasets/custom.py,sha256=npreC7H1VsdTGYkqlMESvyOhtXOfZQA7_-ICmxe3FWk,936
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
-evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
-evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
-evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
-evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
+evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
+evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
+evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
-evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
-evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
+evalscope/perf/utils/benchmark_util.py,sha256=XrpB6ISjY2p1ngwPr5eOQS7O_I1kmlbEn2wCwsC_5AA,6278
+evalscope/perf/utils/db_util.py,sha256=VDqiM6xOK7fSneU3YOOU-78LWB8El3mxj_Ixtw2gX3o,9051
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
+evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
 evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -267,7 +268,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
 evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
 evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
-evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
+evalscope/report/app.py,sha256=Lew--YreNeuyLVktnUNZKIfGvnGE_oAD054kZB-YTHo,26904
 evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
 evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
 evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -314,28 +315,28 @@ evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,15
 evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
 evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
 evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
-evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
+evalscope/utils/utils.py,sha256=VuGdJh3xZAZ-cRoGcKeJTx3z8sgSs2eMjH-1JX2ZYOU,10615
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
-tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
-tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
+tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
+tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
+tests/cli/test_run.py,sha256=RW4AkJILqzzyd0wuIdy8Y9SB_4koSRJFezGjFdXdLJI,16549
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
+tests/perf/test_perf.py,sha256=BXd6SCMbBDKmh-P_KGTOpuwVQZ05xCKjvH01zGyvBJI,3787
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
-tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
-tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
+tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
+tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
+tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
 tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
 tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
-evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
-evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
-evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
-evalscope-0.13.2.dist-info/RECORD,,
+tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
+evalscope-0.14.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.14.0.dist-info/METADATA,sha256=HQ1pt-YU950AcwwWiypjGcWg0wYU9n6PFZ7j6PG4uHg,33040
+evalscope-0.14.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.14.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.14.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.14.0.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -4,13 +4,12 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 
 import os
-import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
 from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed, test_level_list
+from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
 os.environ['LOG_LEVEL'] = 'DEBUG'
@@ -46,6 +45,7 @@ datasets=[
     'chinese_simpleqa',
     'alpaca_eval',
     'arena_hard',
+    'maritime_bench',
 ]
 
 dataset_args={
@@ -134,8 +134,8 @@ class TestRun(unittest.TestCase):
             eval_type=EvalType.SERVICE,
             datasets=datasets,
             dataset_args=dataset_args,
-            eval_batch_size=2,
-            limit=2,
+            eval_batch_size=1,
+            limit=1,
             stream=True,
             generation_config={
                 'temperature': 0,
tests/cli/test_collection.py CHANGED
@@ -80,4 +80,5 @@ class TestCollection(unittest.TestCase):
                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
             }
         )
-        run_task(task_cfg=task_cfg)
+        res = run_task(task_cfg=task_cfg)
+        print(res)
tests/cli/test_run.py CHANGED
@@ -137,7 +137,7 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['gsm8k'],
             },
             'musr': {
-                'subset_list': ['murder_mysteries']
+                'subset_list': ['murder_mysteries'],
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -263,7 +263,7 @@ class TestRun(unittest.TestCase):
         datasets=[
             # 'iquiz',
             # 'ifeval',
-            # 'mmlu',
+            'mmlu',
             # 'mmlu_pro',
             # 'musr',
             # 'process_bench',
@@ -281,9 +281,10 @@ class TestRun(unittest.TestCase):
             # 'ceval',
             # 'hellaswag',
             # 'general_mcq',
-            'general_qa'
+            # 'general_qa'
             # 'super_gpqa',
-            # 'mmlu_redux'
+            # 'mmlu_redux',
+            'maritime_bench'
         ],
         dataset_args={
             'mmlu': {
@@ -322,7 +323,8 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['gsm8k'],
             },
             'musr': {
-                'subset_list': ['murder_mysteries']
+                'subset_list': ['murder_mysteries'],
+                'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq',  # custom dataset path
@@ -353,10 +355,9 @@ class TestRun(unittest.TestCase):
         stream=False,
         generation_config={
             'temperature': 0,
-            'n': 2,
+            'n': 1,
             'max_tokens': 4096,
-        },
-        use_cache='outputs/20250326_202848',
+        }
     )
 
     run_task(task_cfg=task_cfg)
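The new maritime_bench adapter is exercised by both test files above. Based on those tests, a minimal native run would look like this sketch (the model id and limit are placeholders; only datasets=['maritime_bench'] comes from the diff):

from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['maritime_bench'],
    limit=1,
)
run_task(task_cfg=task_cfg)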