evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py CHANGED
@@ -1,18 +1,22 @@
 import argparse
+import json
+import os
 import sys
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-import json
+from evalscope.constants import DEFAULT_WORK_DIR


 @dataclass
 class Arguments:
     # Model and API
-    model: str  # Model identifier
+    model: str  # Model name or path
+    model_id: Optional[str] = None  # Model identifier
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
+    port: int = 8877  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -32,6 +36,9 @@ class Arguments:
     wandb_api_key: Optional[str] = None  # WandB API key for logging
     name: Optional[str] = None  # Name for the run

+    # Output settings
+    outputs_dir: str = DEFAULT_WORK_DIR
+
     # Prompt settings
     max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
@@ -57,7 +64,6 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
         return Arguments(
             model=args.model,
             attn_implementation=args.attn_implementation,
@@ -72,6 +78,7 @@ class Arguments:
             headers=args.headers,
             wandb_api_key=args.wandb_api_key,
             name=args.name,
+            outputs_dir=args.outputs_dir,
             debug=args.debug,
             tokenizer_path=args.tokenizer_path,
             api=args.api,
@@ -98,6 +105,7 @@ class Arguments:
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+        self.model_id = os.path.basename(self.model)

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
@@ -130,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
@@ -152,6 +161,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

+    # Output settings
+    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+
     # Dataset settings
     parser.add_argument('--dataset', type=str, default='openqa', help='Specify the dataset')
     parser.add_argument('--dataset-path', type=str, required=False, help='Path to the dataset file')
@@ -170,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+
     # yapf: enable

evalscope/perf/benchmark.py CHANGED
@@ -1,16 +1,15 @@
 import asyncio
 import copy
+import json
+import numpy as np
 import os
 import platform
 import sqlite3
 import threading
 import time
 from http import HTTPStatus
-from typing import List
-
-import json
-import numpy as np
 from tqdm import tqdm
+from typing import List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -138,17 +137,17 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     api_plugin_class = ApiRegistry(args.api)
     api_plugin = api_plugin_class(args.tokenizer_path)

-    result_db_path = get_result_db_path(args.name, args.model)
+    result_db_path = get_result_db_path(args)
     # Initialize wandb
     if args.wandb_api_key:
-        import wandb
         import datetime
+        import wandb
         os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] = './outputs'
+        os.environ['WANDB_DIR'] = args.outputs_dir

         wandb.login(key=args.wandb_api_key)
         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-        name = args.name if args.name else f'{args.model}_{current_time}'
+        name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

     with sqlite3.connect(result_db_path) as con:
@@ -196,10 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()

     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:8877/v1/completions'
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:8877/v1/chat/completions'
-        args.model = os.path.basename(args.model)
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
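
Illustrative sketch (not part of the diff): how the new perf arguments above fit together. It assumes Arguments can be constructed directly with keyword arguments and that the model_id assignment shown in arguments.py runs during dataclass initialization; the port-based URL mirrors the rewritten start_server logic in benchmark.py.

# Hypothetical usage of the new 0.8.1 perf arguments (sketch only).
from evalscope.perf.arguments import Arguments

args = Arguments(
    model='/models/Qwen2-7B-Instruct',  # hypothetical local path
    api='local',                        # serve the model locally instead of calling a remote API
    port=8878,                          # new in 0.8.1: configurable port for the local server
    outputs_dir='./perf_outputs',       # new in 0.8.1: root directory for logs and the result DB
)
# If the model_id assignment runs in __post_init__, it is the basename of the model path:
print(args.model_id)  # expected: 'Qwen2-7B-Instruct'
# start_server() now derives the endpoint from args.port instead of a hard-coded 8877:
print(f'http://127.0.0.1:{args.port}/v1/chat/completions')
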
evalscope/perf/http_client.py CHANGED
@@ -1,12 +1,10 @@
+import aiohttp
 import asyncio
-import logging
+import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Tuple

-import aiohttp
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
@@ -21,7 +19,6 @@ class AioHttpClient:
         args: Arguments,
     ):
         self.url = args.url
-        self.debug = args.debug
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
@@ -31,9 +28,7 @@ class AioHttpClient:
                 connect=self.connect_timeout,
                 sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
-            trace_configs=[self._create_trace_config()] if self.debug else [])
-        if self.debug:
-            get_logger(log_level=logging.DEBUG)
+            trace_configs=[self._create_trace_config()] if args.debug else [])

     def _create_trace_config(self):
         trace_config = aiohttp.TraceConfig()
evalscope/perf/main.py CHANGED
@@ -1,11 +1,14 @@
 import asyncio
+import logging
+import os
 import platform
 from argparse import Namespace

 from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
+from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

 logger = get_logger()
@@ -18,6 +21,10 @@ def run_perf_benchmark(args):
     args = Arguments.from_args(args)
     seed_everything(args.seed)

+    # Setup logger and output
+    args.outputs_dir = get_output_path(args)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+
     logger.info('Starting benchmark...')
     logger.info(args)

evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -1,7 +1,6 @@
-from typing import Any, Dict, Iterator, List
-
 import json
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
evalscope/perf/plugin/api/dashscope_api.py CHANGED
@@ -1,8 +1,7 @@
+import json
 import os
 from typing import Any, Dict, Iterator, List

-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,8 +1,7 @@
-import os
-from typing import Any, Dict, Iterator, List
-
 import json
+import os
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -30,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:
evalscope/perf/plugin/datasets/base.py CHANGED
@@ -1,9 +1,8 @@
+import json
 import sys
 from abc import abstractmethod
 from typing import Any, Dict, Iterator, List, Tuple

-import json
-
 from evalscope.perf.arguments import Arguments


evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -1,9 +1,8 @@
 import base64
 from io import BytesIO
-from typing import Any, Dict, Iterator, List
-
 from modelscope.msdatasets import MsDataset
 from PIL import Image
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -1,6 +1,5 @@
-from typing import Any, Dict, Iterator, List
-
 from modelscope import MsDataset
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
evalscope/perf/plugin/datasets/openqa.py CHANGED
@@ -1,8 +1,7 @@
+import json
 import subprocess
 from typing import Any, Dict, Iterator, List

-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
evalscope/perf/plugin/registry.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str | List[str]):
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str | List[str]):
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
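
For context, a hedged sketch of how these registry decorators are applied (the plugin names and classes below are hypothetical, not from the diff). Switching the annotations to Union[str, List[str]] keeps the module importable on Python < 3.10, where the `str | List[str]` syntax raises a TypeError when the function is defined.

# Hypothetical registration sketch; real plugins subclass DatasetPluginBase / ApiPluginBase.
from evalscope.perf.plugin.registry import register_api, register_dataset


@register_dataset(['my_openqa', 'my_openqa_zh'])  # accepts a single name or a list of aliases
class MyDatasetPlugin:
    """Placeholder dataset plugin used only to illustrate the decorator."""


@register_api('my_api')
class MyApiPlugin:
    """Placeholder API plugin used only to illustrate the decorator."""
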
evalscope/perf/utils/analysis_result.py CHANGED
@@ -1,9 +1,8 @@
 import base64
+import json
 import pickle
 import sqlite3

-import json
-
 result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -1,9 +1,8 @@
 import time
+import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple

-import torch
-
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -117,19 +116,19 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py CHANGED
@@ -1,12 +1,12 @@
 import base64
+import json
 import os
 import pickle
 import sqlite3
 import sys
 from datetime import datetime
-
-import json
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -88,15 +88,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     cursor.execute(query, common_columns)


-def get_result_db_path(name, model):
+def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir = './outputs'
-    result_db_path = os.path.join(output_dir, f'{name or model}_perf', current_time, 'benchmark_data.db')
+    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    if not os.path.exists(output_path):
+        os.makedirs(output_path, exist_ok=True)
+    logger.info(f'Save the result to: {output_path}')
+    return output_path
+

-    if not os.path.exists(os.path.dirname(result_db_path)):
-        os.makedirs(os.path.dirname(result_db_path), exist_ok=True)
+def get_result_db_path(args: Arguments):
+    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

-    logger.info(f'Save the result to: {result_db_path}')
+    logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
         logger.warning('The db file exists, delete it and start again!.')
         sys.exit(1)
@@ -104,44 +108,87 @@ def get_result_db_path(name, model):
     return result_db_path


-def get_percentile_results(result_db_path: str):
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.

-    def percentile_results(rows, index, percentiles):
-        results = {}
-        n_success_queries = len(rows)
-        for percentile in percentiles:
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-            row = rows[idx]
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-        return results
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows) <= len(percentiles):
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}

-    # Calculate percentiles for first chunk latency and latency
-    first_chunk_latency_index = 5
-    latency_index = 4
+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)':
+        inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)':
+        [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+         for row in rows]
+    }

-    first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
-    rows.sort(key=lambda x: x[latency_index])
-    latency_results = percentile_results(rows, latency_index, percentiles)
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]

-    # Prepare data for tabulation
-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
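
A standalone worked example of the per-metric percentile logic above (sketch only, not an evalscope import): TPOT values are the gaps between consecutive chunk arrival times, and each metric's percentile is the value at index n * p / 100 of its sorted list.

# Minimal, self-contained sketch of the 0.8.1 percentile computation.
import json
from typing import Dict, List


def inter_token_latencies(chunk_times_json: str) -> List[float]:
    # Gaps between consecutive chunk arrival times -> per-token latencies (TPOT)
    chunk_times = json.loads(chunk_times_json)
    return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    # Same index rule as above: the value at position n * p / 100 of the sorted list
    data = sorted(data)
    n = len(data)
    return {p: round(data[int(n * p / 100)], 4) for p in percentiles}


tpot = inter_token_latencies('[0.00, 0.12, 0.25, 0.41, 0.60]')
print(calculate_percentiles(tpot, [50, 90]))  # -> {50: 0.16, 90: 0.19}
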
evalscope/perf/utils/local_server.py CHANGED
@@ -1,16 +1,15 @@
 import os
 import subprocess
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-
 import torch
 import uvicorn
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
-from evalscope.perf.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
     torch.cuda.empty_cache()


-def create_app(args) -> FastAPI:
+def create_app(model, attn_implementation=None) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-    chat_service = ChatService(model_path=args.model, attn_implementation=args.attn_implementation)
+    chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)

     app.add_middleware(
         CORSMiddleware,
@@ -98,18 +97,27 @@ def create_app(args) -> FastAPI:

 def start_app(args: Arguments):
     if args.api == 'local':
-        app = create_app(args)
-        uvicorn.run(app, host='0.0.0.0', port=8877, workers=1)
+        app = create_app(args.model, args.attn_implementation)
+        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        # yapf: disable
         proc = subprocess.Popen([
-            'python', '-m', 'vllm.entrypoints.openai.api_server', '--model', args.model, '--served-model-name',
-            os.path.basename(args.model), '--tensor-parallel-size',
-            str(torch.cuda.device_count()), '--max-model-len', '32768', '--gpu-memory-utilization', '0.9', '--host',
-            '0.0.0.0', '--port', '8877', '--disable-log-requests', '--disable-log-stats'
+            'python', '-m', 'vllm.entrypoints.openai.api_server',
+            '--model', args.model,
+            '--served-model-name', args.model,
+            '--tensor-parallel-size', str(torch.cuda.device_count()),
+            '--max-model-len', '32768',
+            '--gpu-memory-utilization', '0.9',
+            '--host', '0.0.0.0',
+            '--port', str(args.port),
+            '--trust-remote-code',
+            '--disable-log-requests',
+            '--disable-log-stats',
         ])
+        # yapf: enable
         import atexit

         def on_exit():
evalscope/registry/config/cfg_arena_zhihu.yaml CHANGED
@@ -21,7 +21,7 @@ answers_gen:
     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
     revision: NULL # revision of model, default is NULL
     precision: torch.float16
-    enable: true # enable or disable this model
+    enable: true # enable or disable this model
     template_type: default-generation
     generation_config:
       do_sample: true
evalscope/registry/tasks/arc.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - arc
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null
evalscope/registry/tasks/bbh.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/registry/tasks/bbh_mini.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/registry/tasks/ceval.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null