evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/http_client.py CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import time
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.api_plugin = api_plugin
+
+        # Configure connector similar to vLLM bench for better TTFT under load.
+        connector = aiohttp.TCPConnector(
+            limit=args.parallel or 0,  # 0 means no limit in aiohttp; use parallel as limit if set
+            limit_per_host=args.parallel or 0,
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            keepalive_timeout=60,
+            enable_cleanup_closed=True,
+            force_close=False,
+            ssl=('https://' in self.url),
+        )
+
         self.client = aiohttp.ClientSession(
+            connector=connector,
+            trust_env=True,
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else []
         )
@@ -43,23 +59,25 @@ class AioHttpClient:
         trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
         return trace_config
 
-    async def post(self, body):
-        """Send POST request and delegate response handling to API plugin.
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+    async def post(self, body) -> BenchmarkData:
+        """
+        Send POST request and delegate response handling to API plugin.
+
+        Returns:
+            BenchmarkData: The benchmark data object containing request and response information.
         """
         try:
             # Delegate the request processing to the API plugin
-            async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
-                yield result
+            output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+            return output
         except asyncio.TimeoutError as e:
             logger.error(
                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-            yield (True, None, str(e))
+            return BenchmarkData(success=False, error=str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-            yield (True, None, str(e))
+            return BenchmarkData(success=False, error=str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
 async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
-    is_error = True
     start_time = time.perf_counter()
 
    async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
             messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
             request = api_plugin.build_request(messages)
 
-            async for is_error, state_code, response_data in client.post(request):
-                return is_error, state_code, response_data
+            output = await client.post(request)
+            return output
 
     while True:
         try:
-            is_error, state_code, response_data = await asyncio.wait_for(
-                attempt_connection(), timeout=args.connect_timeout
-            )
-            if not is_error:
+            output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+            if output.success:
                 logger.info('Test connection successful.')
                 return True
-            logger.warning(f'Retrying... <{state_code}> {response_data}')
+            logger.warning(f'Retrying... <{output.error}>')
         except Exception as e:
             logger.warning(f'Retrying... <{e}>')
 
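With this change `AioHttpClient.post` is a plain coroutine returning one `BenchmarkData` per request instead of an async generator of `(is_error, status_code, response_data)` tuples. A minimal consumer sketch of the new contract (the `success`, `error`, and `generated_text` fields all appear in the hunks of this changeset; client construction and the request body are elided):

    output = await client.post(request_body)
    if output.success:
        print(output.generated_text)
    else:
        print(f'request failed: {output.error}')
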
evalscope/perf/main.py CHANGED
@@ -4,7 +4,9 @@ import os
 import platform
 import threading
 import time
+import warnings
 from argparse import Namespace
+from logging import warn
 
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
     configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
     # Initialize wandb and swanlab
-    if args.wandb_api_key:
+    visualizer = args.visualizer
+    if visualizer is None:
+        if args.wandb_api_key is not None:
+            visualizer = 'wandb'
+            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+        elif args.swanlab_api_key is not None:
+            visualizer = 'swanlab'
+            warnings.warn(
+                '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+            )
+    args.visualizer = visualizer
+    if visualizer == 'wandb':
         init_wandb(args)
-    if args.swanlab_api_key:
+    elif visualizer == 'swanlab':
         init_swanlab(args)
 
     # Initialize local server if needed
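In practice the migration is a flag swap on the perf command line; a hedged example (only `--visualizer`, `--wandb-api-key`, and `--swanlab-api-key` are confirmed by the hunk above; the rest of the invocation is illustrative):

    # before: still works, now emits a DeprecationWarning
    evalscope perf --url http://127.0.0.1:8000/v1/chat/completions --model test --wandb-api-key <key>
    # after
    evalscope perf --url http://127.0.0.1:8000/v1/chat/completions --model test --visualizer wandb
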
evalscope/perf/plugin/api/base.py CHANGED
@@ -3,6 +3,7 @@ from abc import abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
 class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
-            responses (List[bytes]): List of http response body, for stream output,
+            responses (List[Dict]): List of http response body, for stream output,
                 there are multiple responses, each is bytes, for general only one.
-            request (Any): The request body.
+            request (str): The json string of request.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -52,8 +54,8 @@ class ApiPluginBase:
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
         raise NotImplementedError
 
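Taken together, the new contract for a plugin is: return a single `BenchmarkData` from `process_request` and report token counts from `parse_responses`. A minimal conforming sketch, assuming only the `BenchmarkData` fields visible in this changeset (`start_time`, `completed_time`, `query_latency`, `response_messages`, `success`, `error`, `request`); the endpoint shape is illustrative and the abstract `build_request` is omitted for brevity:

    import json
    import time
    from typing import Any, Dict, List, Tuple

    import aiohttp

    from evalscope.perf.plugin.api.base import ApiPluginBase
    from evalscope.perf.utils.benchmark_util import BenchmarkData


    class EchoApiPlugin(ApiPluginBase):  # hypothetical plugin, for illustration only

        def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
            # Read token counts from the last response's usage block, if present.
            usage = (responses[-1].get('usage') or {}) if responses else {}
            return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)

        async def process_request(
            self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
        ) -> BenchmarkData:
            output = BenchmarkData()
            output.start_time = time.perf_counter()
            output.request = json.dumps(body, ensure_ascii=False)
            async with client_session.post(url, data=output.request, headers=headers) as resp:
                if resp.status == 200:
                    payload = await resp.json()
                    output.response_messages.append(payload)
                    output.success = True
                else:
                    output.success = False
                    output.error = await resp.text()
            output.completed_time = time.perf_counter()
            output.query_latency = output.completed_time - output.start_time
            return output
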
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
         return payload
 
-    def parse_responses(self, responses: List[str], request: Any = None, **kwargs) -> Tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
         """Parse API responses and return token counts.
 
         This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
         to calculate it using a tokenizer.
 
         Args:
-            responses (List[str]): List of API response strings.
-            request (Any, optional): The original request, which might be needed for token calculation.
+            responses (List[Dict]): List of API response strings.
+            request (str, optional): The original request, which might be needed for token calculation.
             **kwargs: Additional arguments.
 
         Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
             logger.error(f'Error parsing responses: {e}')
             return 0, 0
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
             headers (Dict): The request headers.
             body (Dict): The request body.
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-                - is_error: Whether the response indicates an error
-                - status_code: HTTP status code
-                - response_data: Response content
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
-        try:
-            # Set content type header
-            headers = {'Content-Type': 'application/json', **headers}
-
-            # Convert body to JSON
-            data = json.dumps(body, ensure_ascii=False)
-
-            # Send the request
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
-                # Get the status code
-                status_code = response.status
-
-                # Check if it's a streaming response
-                if 'text/event-stream' in response.content_type:
-                    # Handle streaming response
-                    async for line in response.content:
-                        line_str = line.decode('utf-8').strip()
-                        if not line_str:
-                            continue
-
-                        # Check for data prefix in server-sent events
-                        if line_str.startswith('data: '):
-                            data = line_str[6:]  # Remove 'data: ' prefix
-
-                            # Check if it's the end of the stream
-                            if data == '[DONE]':
-                                break
-
-                            try:
-                                # Parse the JSON data
-                                parsed_data = json.loads(data)
-                                yield (False, status_code, json.dumps(parsed_data))
-                            except json.JSONDecodeError:
-                                yield (True, status_code, f'Failed to parse JSON: {data}')
-                else:
-                    # Handle regular response
-                    if 'application/json' in response.content_type:
-                        # JSON response
-                        content = await response.json()
-                        yield (status_code >= 400, status_code, json.dumps(content))
-                    else:
-                        # Text response
-                        content = await response.text()
-                        yield (status_code >= 400, status_code, content)
-
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, 500, str(e))
+        raise NotImplementedError(
+            'The `process_request` method must be implemented in a subclass. '
+            'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.'  # noqa: E501
+        )
 
 
 if __name__ == '__main__':
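Since `CustomPlugin.process_request` now defers to subclasses, an OpenAI-compatible endpoint only needs the payload shaping; a hedged sketch (the `register_api` import and the `build_request(messages)` call site both appear in this changeset, but the decorator usage, registration name, and payload fields are assumptions):

    from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
    from evalscope.perf.plugin.registry import register_api


    @register_api('my_openai_like')  # hypothetical registration name
    class MyApiPlugin(DefaultApiPlugin):

        def build_request(self, messages):
            # Shape the request body; the HTTP/SSE handling is inherited
            # from DefaultApiPlugin.process_request.
            return {'model': 'my-model', 'messages': messages, 'stream': True}
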
evalscope/perf/plugin/api/default_api.py CHANGED
@@ -1,24 +1,77 @@
 import aiohttp
+import codecs
 import json
-from http import HTTPStatus
-from typing import Any, AsyncGenerator, Dict, List, Tuple
+import sys
+import time
+import traceback
+from typing import Any, Dict
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+        # Keep decoder state across chunks to handle split multibyte sequences
+        self.decoder = codecs.getincrementaldecoder('utf-8')()
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        # Use incremental decoding so incomplete multibyte sequences don't error
+        try:
+            chunk_str = self.decoder.decode(chunk_bytes, final=False)
+        except UnicodeDecodeError:
+            # Bad bytes: drop them and reset decoder state to avoid corruption
+            self.decoder.reset()
+            chunk_str = chunk_bytes.decode('utf-8', errors='ignore')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""
 
     def __init__(self, param: Arguments):
         super().__init__(param)
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -27,79 +80,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-            async for chunk_bytes in response.content:
-                chunk_bytes = chunk_bytes.strip()
-                if not chunk_bytes:
-                    continue
-                chunk_bytes = chunk_bytes.decode('utf-8')
-                # NOTE: SSE comments (often used as pings) start with a colon.
-                # These are not JSON data payload and should be skipped.
-                if chunk_bytes.startswith(':'):
-                    continue
-
-                chunk = chunk_bytes.removeprefix('data: ')
-
-                if chunk != '[DONE]':
-                    data = json.loads(chunk)
-
-                    yield False, response.status, data
-
-        except Exception as e:
-            logger.error(f'Error in _handle_stream: {e}')
-            yield True, response.status, str(e)
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        """Handle the HTTP response based on content type and status.
-
-        Args:
-            response: The aiohttp response object
-
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-        """
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_stream = 'text/event-stream'
-        is_success = (response_status == HTTPStatus.OK)
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                yield (False, response_status, json.dumps(content, ensure_ascii=False))
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content.decode('utf-8'))
-        else:
-            # error is always in JSON format
-            error = await response.json()
-            yield (True, response_status, json.dumps(error, ensure_ascii=False))
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                            generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
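The buffering behaviour of `StreamedResponseHandler` is easiest to see on an SSE event that arrives split across reads; a small illustration against the class as defined above (the byte strings are made up):

    handler = StreamedResponseHandler()
    # First chunk ends mid-JSON: nothing is complete yet, so it is buffered.
    handler.add_chunk(b'data: {"choices": [{"delta": {"conte')   # -> []
    # Second chunk completes the event and adds the stream terminator.
    handler.add_chunk(b'nt": "hi"}}]}\n\ndata: [DONE]\n\n')
    # -> ['data: {"choices": [{"delta": {"content": "hi"}}]}', 'data: [DONE]']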