evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/arguments.py
CHANGED

@@ -33,11 +33,17 @@ class Arguments(BaseArgument):
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
     sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
 
+    # Tuning knobs
+    db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+    queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+    in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
+
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
     debug: bool = False  # Debug mode
-
-
+    visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+    wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+    swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
     name: Optional[str] = None  # Name for the run
 
     # Output settings
@@ -68,7 +74,7 @@ class Arguments(BaseArgument):
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = None  # Random seed for reproducibility
     stop: Optional[List[str]] = None  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
@@ -107,6 +113,14 @@ class Arguments(BaseArgument):
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
+        # Validate tuning knobs
+        if self.db_commit_interval <= 0:
+            self.db_commit_interval = 1
+        if self.queue_size_multiplier <= 0:
+            self.queue_size_multiplier = 1
+        if self.in_flight_task_multiplier <= 0:
+            self.in_flight_task_multiplier = 1
+
 
 class ParseKVAction(argparse.Action):
 
@@ -152,9 +166,15 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
 
+    # Tuning knobs
+    parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+    parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+    parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
+
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+    parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
     parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
     parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
@@ -190,7 +210,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=None)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
     parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
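
The three new knobs are plain dataclass fields, so they can be set from Python as well as from the new CLI flags. A minimal sketch, not taken from the package docs: the `model`, `url` and `api` fields are assumed from the existing perf arguments, and the endpoint is a placeholder.

    from evalscope.perf.arguments import Arguments

    args = Arguments(
        model='qwen2.5',                                  # assumed model name
        url='http://127.0.0.1:8801/v1/chat/completions',  # assumed local OpenAI-compatible endpoint
        api='openai',
        parallel=8,
        number=100,
        db_commit_interval=500,       # commit buffered rows to SQLite every 500 results
        queue_size_multiplier=4,      # result queue maxsize = parallel * 4
        in_flight_task_multiplier=2,  # at most parallel * 2 scheduled request tasks
    )
    # Per the validation added above, non-positive values are clamped back to 1 in __post_init__.
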
evalscope/perf/benchmark.py
CHANGED

@@ -3,8 +3,6 @@ import json
 import numpy as np
 import platform
 import sqlite3
-import time
-from http import HTTPStatus
 from tqdm import tqdm
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
@@ -82,86 +80,58 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
-
+    client: AioHttpClient,  # reuse shared client
 ):
     async with semaphore:
-
-
-
-        benchmark_data.start_time = time.perf_counter()
-        collected_messages = []
-        try:
-            async for is_error, state_code, response_data in client.post(request):
-                if is_error or state_code != HTTPStatus.OK:
-                    error_msg = str(response_data) if response_data else 'Unknown error'
-                    logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
-                    benchmark_data.success = False
-                    break
-                if response_data:
-                    collected_messages.append(response_data)
-                    benchmark_data.chunk_times.append(time.perf_counter())
-                benchmark_data.success = True
-                benchmark_data.update_gpu_usage()
-        except Exception as e:
-            if response_data:
-                collected_messages.append(response_data)
-            benchmark_data.success = False
-            logger.exception(e)
-            logger.error(f'Request query: {request} exception')
-        finally:
-            benchmark_data.completed_time = time.perf_counter()
-            benchmark_data.response_messages = collected_messages
-            await benchmark_data_queue.put(benchmark_data)
+        benchmark_data = await client.post(request)
+        benchmark_data.update_gpu_usage()
+        await benchmark_data_queue.put(benchmark_data)
 
 
 @exception_handler
 async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
-
     result_db_path = get_result_db_path(args)
 
-
-
-
-    while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-        try:
-            # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-            benchmark_data_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Update metrics based on the benchmark data
-        metrics.update_metrics(benchmark_data, api_plugin)
-
-        # Collect benchmark data for later database insertion
-        collected_benchmark_data.append(benchmark_data)
-
-        # Create a message with the updated metrics
-        message = metrics.create_message()
+    # Stream inserts to DB to avoid accumulating all results in memory
+    commit_every = args.db_commit_interval
+    processed_since_commit = 0
 
-        # Log the message to wandb\swanlab if the api key is provided
-        if args.wandb_api_key:
-            import wandb
-            wandb.log(message)
-        if args.swanlab_api_key:
-            import swanlab
-            swanlab.log(message)
-
-        # Log the message to the logger every n queries
-        if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-            msg = json.dumps(message, ensure_ascii=False, indent=2)
-            logger.info(msg)
-
-        pbar.update(1)  # Update the progress bar
-
-    # Now perform database operations after all benchmark data has been processed
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
         create_result_table(cursor)
-
-
+
+        with tqdm(desc='Processing', total=args.number) as pbar:
+            while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+                try:
+                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+                except asyncio.TimeoutError:
+                    continue
+
+                # Update metrics and write to DB immediately
+                metrics.update_metrics(benchmark_data, api_plugin)
+                insert_benchmark_data(cursor, benchmark_data)
+                processed_since_commit += 1
+                if processed_since_commit >= commit_every:
+                    con.commit()
+                    processed_since_commit = 0
+
+                message = metrics.create_message()
+
+                if args.wandb_api_key:
+                    import wandb
+                    wandb.log(message)
+                if args.swanlab_api_key:
+                    import swanlab
+                    swanlab.log(message)
+
+                if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                    msg = json.dumps(message, ensure_ascii=False, indent=2)
+                    logger.info(msg)
+
+                benchmark_data_queue.task_done()
+                pbar.update(1)
+
         con.commit()
 
     return metrics, result_db_path
@@ -179,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
-    # Create API plugin instance for request/response processing
     api_plugin_class = ApiRegistry.get_class(args.api)
     api_plugin = api_plugin_class(args)
 
-
-    benchmark_data_queue = asyncio.Queue()
-    # reset event
+    benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
    data_process_completed_event.clear()
+
     # test connection
     await connect_test(args, api_plugin)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Create a single shared client session for all requests
+    client = AioHttpClient(args, api_plugin)
+    async with client:
+        # start statistic benchmark metric (consumer)
+        statistic_benchmark_metric_task = asyncio.create_task(
+            statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+        )
+
+        # start sending requests with bounded in-flight tasks
+        semaphore = asyncio.Semaphore(args.parallel)
+        in_flight: set[asyncio.Task] = set()
+        max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+        async for request in get_requests(args, api_plugin):
+            # Keep the number of scheduled tasks bounded to avoid OOM
+            if len(in_flight) >= max_in_flight:
+                done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                in_flight = pending
+
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+            in_flight.add(task)
+
+        # Wait for remaining in-flight tasks
+        if in_flight:
+            await asyncio.gather(*in_flight, return_exceptions=True)
+
+        # Drain queue and finish
+        await benchmark_data_queue.join()
+        data_process_completed_event.set()
+
+        metrics, result_db_path = await statistic_benchmark_metric_task
+
     metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
     return metrics_result, percentile_result
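
The rewritten benchmark loop above replaces the old collect-everything approach with a bounded result queue plus a cap on scheduled request tasks. A self-contained sketch of the same scheduling pattern, independent of evalscope (names and sizes are illustrative):

    import asyncio


    async def worker(i: int, queue: asyncio.Queue) -> None:
        await asyncio.sleep(0.01)   # stands in for one HTTP request
        await queue.put(i)          # blocks when the bounded queue is full

    async def consumer(queue: asyncio.Queue, total: int) -> None:
        for _ in range(total):
            await queue.get()       # process one result (update metrics, insert into DB, ...)
            queue.task_done()

    async def main(total: int = 100, parallel: int = 8, in_flight_multiplier: int = 2) -> None:
        queue: asyncio.Queue = asyncio.Queue(maxsize=parallel * 5)
        consumer_task = asyncio.create_task(consumer(queue, total))

        in_flight: set = set()
        max_in_flight = parallel * in_flight_multiplier
        for i in range(total):
            if len(in_flight) >= max_in_flight:
                # Wait for at least one task to finish before scheduling more.
                _, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
            in_flight.add(asyncio.create_task(worker(i, queue)))

        if in_flight:
            await asyncio.gather(*in_flight)
        await queue.join()          # all results handed to the consumer
        await consumer_task

    asyncio.run(main())
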
evalscope/perf/http_client.py
CHANGED

@@ -3,6 +3,7 @@ import asyncio
 import time
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.api_plugin = api_plugin
+
+        # Configure connector similar to vLLM bench for better TTFT under load.
+        connector = aiohttp.TCPConnector(
+            limit=args.parallel or 0,  # 0 means no limit in aiohttp; use parallel as limit if set
+            limit_per_host=args.parallel or 0,
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            keepalive_timeout=60,
+            enable_cleanup_closed=True,
+            force_close=False,
+            ssl=('https://' in self.url),
+        )
+
         self.client = aiohttp.ClientSession(
+            connector=connector,
+            trust_env=True,
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else []
         )
@@ -43,23 +59,25 @@ class AioHttpClient:
         trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
         return trace_config
 
-    async def post(self, body):
-        """
-
-
+    async def post(self, body) -> BenchmarkData:
+        """
+        Send POST request and delegate response handling to API plugin.
+
+        Returns:
+            BenchmarkData: The benchmark data object containing request and response information.
         """
         try:
             # Delegate the request processing to the API plugin
-
-
+            output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+            return output
         except asyncio.TimeoutError as e:
             logger.error(
                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-
+            return BenchmarkData(success=False, error=str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-
+            return BenchmarkData(success=False, error=str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
 async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
-    is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
         messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
         request = api_plugin.build_request(messages)
 
-
-
+        output = await client.post(request)
+        return output
 
     while True:
         try:
-
-
-            )
-            if not is_error:
+            output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+            if output.success:
                 logger.info('Test connection successful.')
                 return True
-            logger.warning(f'Retrying...
+            logger.warning(f'Retrying... <{output.error}>')
         except Exception as e:
             logger.warning(f'Retrying... <{e}>')
 
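
The client now builds one TCPConnector-backed session that every request shares, instead of relying on a default connector per call. A generic aiohttp sketch of that design choice, with placeholder URL and timeouts (not the packaged class):

    import asyncio

    import aiohttp


    async def fetch(session: aiohttp.ClientSession, url: str) -> int:
        async with session.get(url) as resp:
            await resp.read()
            return resp.status

    async def main(parallel: int = 8) -> None:
        connector = aiohttp.TCPConnector(
            limit=parallel,         # pool at most `parallel` connections in total
            limit_per_host=parallel,
            ttl_dns_cache=300,      # cache DNS lookups for five minutes
            keepalive_timeout=60,   # reuse warm connections between requests
        )
        timeout = aiohttp.ClientTimeout(connect=10, sock_read=60)
        async with aiohttp.ClientSession(connector=connector, trust_env=True, timeout=timeout) as session:
            statuses = await asyncio.gather(*(fetch(session, 'https://example.com/') for _ in range(parallel)))
            print(statuses)

    asyncio.run(main())
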
evalscope/perf/main.py
CHANGED

@@ -4,7 +4,9 @@ import os
 import platform
 import threading
 import time
+import warnings
 from argparse import Namespace
+from logging import warn
 
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
     configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
     # Initialize wandb and swanlab
-
+    visualizer = args.visualizer
+    if visualizer is None:
+        if args.wandb_api_key is not None:
+            visualizer = 'wandb'
+            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+        elif args.swanlab_api_key is not None:
+            visualizer = 'swanlab'
+            warnings.warn(
+                '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+            )
+    args.visualizer = visualizer
+    if visualizer == 'wandb':
         init_wandb(args)
-
+    elif visualizer == 'swanlab':
         init_swanlab(args)
 
     # Initialize local server if needed
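
The new fallback can be read as a small pure function: an explicit `--visualizer` value wins, and the old API-key switches are mapped onto it with a deprecation warning. A standalone restatement of that branch (not the packaged function):

    import warnings
    from typing import Optional


    def select_visualizer(visualizer: Optional[str],
                          wandb_api_key: Optional[str],
                          swanlab_api_key: Optional[str]) -> Optional[str]:
        if visualizer is not None:
            return visualizer
        if wandb_api_key is not None:
            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
            return 'wandb'
        if swanlab_api_key is not None:
            warnings.warn('--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning)
            return 'swanlab'
        return None


    assert select_visualizer(None, 'key', None) == 'wandb'
    assert select_visualizer('swanlab', 'key', None) == 'swanlab'
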
evalscope/perf/plugin/api/base.py
CHANGED

@@ -3,6 +3,7 @@ from abc import abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
 class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_responses(self, responses: List, request:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
-            responses (List[
+            responses (List[Dict]): List of http response body, for stream output,
                 there are multiple responses, each is bytes, for general only one.
-            request (
+            request (str): The json string of request.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -52,8 +54,8 @@ class ApiPluginBase:
             headers: The request headers
             body: The request body
 
-
-
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
         raise NotImplementedError
 
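
With the base class now returning BenchmarkData from `process_request`, a custom plugin is a plain subclass. A minimal sketch under stated assumptions: the BenchmarkData constructor and field names (`request`, `start_time`, `completed_time`, `response_messages`) and the `build_request` signature are inferred from this diff rather than the package docs, and a real plugin would still be registered via `register_api` so that `--api <name>` can resolve it.

    import json
    import time
    from typing import Any, Dict, List, Tuple

    import aiohttp

    from evalscope.perf.plugin.api.base import ApiPluginBase
    from evalscope.perf.utils.benchmark_util import BenchmarkData


    class MinimalPlugin(ApiPluginBase):

        def build_request(self, messages, param=None) -> Dict:
            # Assumed shape: an OpenAI-style chat payload.
            return {'messages': messages}

        def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
            usage = responses[-1].get('usage', {}) if responses else {}
            return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)

        async def process_request(
            self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
        ) -> BenchmarkData:
            data = BenchmarkData(request=body)  # constructor argument assumed
            data.start_time = time.perf_counter()
            async with client_session.post(url, data=json.dumps(body), headers=headers) as resp:
                payload = await resp.json()
                data.success = resp.status < 400
                data.response_messages = [payload]
            data.completed_time = time.perf_counter()
            return data
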
evalscope/perf/plugin/api/custom_api.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
         return payload
 
-    def parse_responses(self, responses: List[
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
         """Parse API responses and return token counts.
 
         This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
         to calculate it using a tokenizer.
 
         Args:
-            responses (List[
-            request (
+            responses (List[Dict]): List of API response strings.
+            request (str, optional): The original request, which might be needed for token calculation.
             **kwargs: Additional arguments.
 
         Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
             logger.error(f'Error parsing responses: {e}')
             return 0, 0
 
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
             headers (Dict): The request headers.
             body (Dict): The request body.
 
-
-
-            - is_error: Whether the response indicates an error
-            - status_code: HTTP status code
-            - response_data: Response content
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
-
-
-
-
-            # Convert body to JSON
-            data = json.dumps(body, ensure_ascii=False)
-
-            # Send the request
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
-                # Get the status code
-                status_code = response.status
-
-                # Check if it's a streaming response
-                if 'text/event-stream' in response.content_type:
-                    # Handle streaming response
-                    async for line in response.content:
-                        line_str = line.decode('utf-8').strip()
-                        if not line_str:
-                            continue
-
-                        # Check for data prefix in server-sent events
-                        if line_str.startswith('data: '):
-                            data = line_str[6:]  # Remove 'data: ' prefix
-
-                            # Check if it's the end of the stream
-                            if data == '[DONE]':
-                                break
-
-                            try:
-                                # Parse the JSON data
-                                parsed_data = json.loads(data)
-                                yield (False, status_code, json.dumps(parsed_data))
-                            except json.JSONDecodeError:
-                                yield (True, status_code, f'Failed to parse JSON: {data}')
-                else:
-                    # Handle regular response
-                    if 'application/json' in response.content_type:
-                        # JSON response
-                        content = await response.json()
-                        yield (status_code >= 400, status_code, json.dumps(content))
-                    else:
-                        # Text response
-                        content = await response.text()
-                        yield (status_code >= 400, status_code, content)
-
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, 500, str(e))
+        raise NotImplementedError(
+            'The `process_request` method must be implemented in a subclass. '
+            'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.'  # noqa: E501
+        )
 
 
 if __name__ == '__main__':