evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/perf/plugin/api/openai_api.py
@@ -1,10 +1,13 @@
  import json
+ import math
  import os
+ from collections import defaultdict
  from typing import Any, Dict, List, Tuple, Union

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
  from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.io_utils import base64_to_PIL
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -99,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
  payload.update(param.extra_args)
  return payload

- def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
+ def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
  """Parser responses and return number of request and response tokens.
  Only one response for non-stream, multiple responses for stream.
  """
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
  return input_tokens, output_tokens

  # no usage information in the response, parse the response to get the tokens
- delta_contents = {}
+ delta_contents = defaultdict(list)
  for response in responses:
  if 'object' in response:
  self.__process_response_object(response, delta_contents)
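
Note: switching delta_contents to a defaultdict(list) lets streaming chunks and complete responses share one accumulation path; per choice index, deltas are appended and later joined before token counting. A small illustration with made-up chunk payloads:

    from collections import defaultdict

    delta_contents = defaultdict(list)
    chunks = [
        {'object': 'chat.completion.chunk', 'choices': [{'index': 0, 'delta': {'content': 'Hel'}}]},
        {'object': 'chat.completion.chunk', 'choices': [{'index': 0, 'delta': {'content': 'lo!'}}]},
    ]
    for chunk in chunks:
        for choice in chunk['choices']:
            if 'delta' in choice and 'content' in choice['delta']:
                delta_contents[choice['index']].append(choice['delta']['content'])

    full_text = {idx: ''.join(parts) for idx, parts in delta_contents.items()}
    # full_text == {0: 'Hello!'}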
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
  input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
  return input_tokens, output_tokens

- def __process_response_object(self, js, delta_contents):
- if js['object'] == 'chat.completion':
- for choice in js['choices']:
+ def __process_response_object(self, response, delta_contents):
+ if not response.get('choices'):
+ return
+ if response['object'] == 'chat.completion':
+ for choice in response['choices']:
  delta_contents[choice['index']] = [choice['message']['content']]
- elif js['object'] == 'text_completion':
- for choice in js['choices']:
- delta_contents[choice['index']] = [choice['text']]
- elif js['object'] == 'chat.completion.chunk':
- for choice in js.get('choices', []):
+ elif response['object'] == 'text_completion':
+ for choice in response['choices']:
+ if 'text' in choice and 'index' in choice:
+ delta_contents[choice['index']].append(choice['text'])
+ elif response['object'] == 'chat.completion.chunk':
+ for choice in response['choices']:
  if 'delta' in choice and 'index' in choice:
  delta = choice['delta']
  idx = choice['index']
  if 'content' in delta:
- delta_content = delta['content']
- delta_contents.setdefault(idx, []).append(delta_content)
+ delta_contents[idx].append(delta['content'])

- def __process_no_object(self, js, delta_contents):
+ def __process_no_object(self, response, delta_contents):
  # assume the response is a single choice
- for choice in js['choices']:
+ if not response.get('choices'):
+ return
+ for choice in response['choices']:
  if 'delta' in choice:
  delta = choice['delta']
  idx = choice['index']
  if 'content' in delta:
- delta_content = delta['content']
- delta_contents.setdefault(idx, []).append(delta_content)
+ delta_contents[idx].append(delta['content'])
  else:
  delta_contents[choice['index']] = [choice['message']['content']]

- def __calculate_tokens_from_content(self, request, delta_contents):
+ def __calculate_tokens_from_content(self, request, content):
  input_tokens = output_tokens = 0
  if self.tokenizer is not None:
- for idx, choice_contents in delta_contents.items():
+ # Calculate input tokens
+ input_tokens += self._count_input_tokens(request)
+ for idx, choice_contents in content.items():
  full_response_content = ''.join(choice_contents)
- input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
- output_tokens += len(self.tokenizer.encode(full_response_content))
+ # Calculate output tokens
+ output_tokens += self._count_output_tokens(full_response_content)
  else:
  raise ValueError(
  'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,60 @@ class OpenaiPlugin(DefaultApiPlugin):
  'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
  )
  return input_tokens, output_tokens
+
+ def _count_input_tokens(self, request_str: str) -> int:
+ """Count the number of input tokens in the request.
+
+ This method handles different types of requests and calculates tokens for:
+ - Text content in messages or prompts
+ - Images in multimodal messages (converted to patch tokens)
+
+ Args:
+ request_str (str): The request json str containing either 'messages' for chat
+ completion or 'prompt' for text completion.
+
+ Returns:
+ int: The total number of input tokens including text and image tokens.
+ """
+ input_tokens = 0
+ request = json.loads(request_str)
+ if 'messages' in request:
+ input_content = self.tokenizer.apply_chat_template(
+ request['messages'], tokenize=True, add_generation_prompt=True
+ )
+ input_tokens += len(input_content)
+ # handle image tokens if any
+ for message in request['messages']:
+ content = message.get('content', '')
+ if isinstance(content, str):
+ continue
+ for cont in content:
+ if cont['type'] == 'image_url':
+ try:
+ # assuming image_url is base64 string
+ image_base64 = cont['image_url']['url']
+ image = base64_to_PIL(image_base64)
+ # Use math.ceil for more accurate token count when image dimensions
+ # aren't perfectly divisible by patch size
+ n_patches = (
+ math.ceil(image.height / self.param.image_patch_size)
+ * math.ceil(image.width / self.param.image_patch_size)
+ )
+ input_tokens += n_patches
+ except Exception as e:
+ logger.warning(f'Failed to process image for token counting: {e}')
+ # Continue processing other content without failing
+ elif 'prompt' in request:
+ input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+ return input_tokens
+
+ def _count_output_tokens(self, response: str) -> int:
+ """Count the number of output tokens in the response. Only string response is supported.
+
+ Args:
+ response (str): The API response text.
+
+ Returns:
+ int: The number of output tokens.
+ """
+ return len(self.tokenizer.encode(response, add_special_tokens=False))
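
Note: for multimodal requests, _count_input_tokens approximates image cost as one token per vision patch, rounding partial patches up. A quick standalone sketch of that arithmetic (the 14-pixel patch size here is illustrative; the real value comes from the image_patch_size setting):

    import math

    def estimate_image_tokens(height: int, width: int, patch_size: int = 14) -> int:
        # One token per patch; edge patches that are only partially covered still count.
        return math.ceil(height / patch_size) * math.ceil(width / patch_size)

    print(estimate_image_tokens(512, 768))  # ceil(512/14) * ceil(768/14) = 37 * 55 = 2035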

evalscope/perf/plugin/datasets/base.py
@@ -15,6 +15,11 @@ class DatasetPluginBase:
  dataset_path (str, optional): The input dataset path. Defaults to None.
  """
  self.query_parameters = query_parameters
+ if query_parameters.tokenizer_path:
+ from modelscope import AutoTokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+ else:
+ self.tokenizer = None

  def __next__(self):
  for item in self.build_messages():
@@ -85,3 +90,19 @@ class DatasetPluginBase:
  for url in image_urls:
  message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
  return message
+
+ def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+ """Check if the prompt length is within the specified range.
+
+ Args:
+ prompt (str): The input prompt string.
+
+ Returns:
+ Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+ """
+ if self.tokenizer is None:
+ prompt_length = len(prompt)
+ else:
+ prompt_length = len(self.tokenizer.encode(prompt))
+ is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+ return is_valid, prompt_length
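
Note: with the tokenizer now constructed in DatasetPluginBase, plugins only need the boolean from check_prompt_length; length is measured in tokens when --tokenizer-path is set and falls back to character count otherwise. A rough usage sketch (the subclass name is illustrative, not from this release):

    from typing import Dict, Iterator, List

    class MyLineDatasetPlugin(DatasetPluginBase):  # hypothetical plugin for illustration
        def build_messages(self) -> Iterator[List[Dict]]:
            for line in self.dataset_line_by_line(self.query_parameters.dataset_path):
                prompt = line.strip()
                is_valid, _length = self.check_prompt_length(prompt)
                if is_valid:
                    yield [self.create_message(prompt)]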

evalscope/perf/plugin/datasets/custom.py
@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
  def build_messages(self) -> Iterator[List[Dict]]:
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/flickr8k.py
@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
  for item in dataset:
  pil_image = item['jpg']
  text = item['txt']
- base64_image = PIL_to_base64(pil_image)
+ base64_image = PIL_to_base64(pil_image, add_header=True)

- message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=base64_image)
  yield [message]

evalscope/perf/plugin/datasets/kontext_bench.py
@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
  for item in dataset:
  pil_image = item['image']
  text = item['instruction']
- base64_image = PIL_to_base64(pil_image)
+ base64_image = PIL_to_base64(pil_image, add_header=True)

- message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=base64_image)
  yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
  def build_messages(self) -> Iterator[List[Dict]]:
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/longalpaca.py
@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
  ds = self.dataset_json_list(self.query_parameters.dataset_path)
  for item in ds:
  prompt = item['instruction'].strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/openqa.py
@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  item = json.loads(item)
  prompt = item['question'].strip()
- if (
- len(prompt) > self.query_parameters.min_prompt_length
- and len(prompt) < self.query_parameters.max_prompt_length
- ):
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/random_dataset.py
@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
  """

  def __init__(self, query_parameters: Arguments):
+ assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.' # noqa: E501
  super().__init__(query_parameters)
- assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.' # noqa: E501

- from modelscope import AutoTokenizer
- self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
  self.prefix_length = self.query_parameters.prefix_length
  self.prefix_ids = self.get_random_inputs(self.prefix_length)
  self.template_len = self.get_template_len()

evalscope/perf/plugin/datasets/random_vl_dataset.py
@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
  # Generate random images based on image_num
  images_b64 = []
  for _ in range(self.image_num):
- images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+ images_b64.append(self._generate_random_image_b64())

  message = self.create_message(text=prompt, image_urls=images_b64)
  yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
  draw.line(coords, fill=shape_color, width=random.randint(1, 5))

  # Convert to base64
- return PIL_to_base64(image, format='PNG')
+ return PIL_to_base64(image, format='PNG', add_header=True)
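
Note: these call sites used to wrap the raw base64 string in a data URI by hand; with add_header=True the helper is expected to return the URI directly. A hedged sketch of an equivalent helper (not the evalscope implementation, just what the call sites imply):

    import base64
    import io
    from PIL import Image

    def pil_to_base64_sketch(image: Image.Image, format: str = 'PNG', add_header: bool = False) -> str:
        buffer = io.BytesIO()
        image.save(buffer, format=format)
        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        if add_header:
            # Same shape as the f-string the plugins previously built themselves.
            return f'data:image/{format.lower()};base64,{encoded}'
        return encoded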

evalscope/perf/utils/benchmark_util.py
@@ -1,8 +1,7 @@
- import time
- import torch
  from dataclasses import dataclass, field
  from typing import Any, List, Optional, Tuple

+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -10,7 +9,7 @@ logger = get_logger()

  @dataclass
  class BenchmarkData:
- request: Any = None
+ request: str = None # json serialized request body
  start_time: float = 0.0
  completed_time: float = 0.0
  chunk_times: List[float] = field(default_factory=list)
@@ -24,30 +23,34 @@ class BenchmarkData:
  time_per_output_token: float = 0.0
  inter_chunk_latency: List[float] = field(default_factory=list)

- prompt_tokens = None
- completion_tokens = None
-
- def _calculate_query_stream_metric(self) -> None:
- self.query_latency = self.completed_time - self.start_time
- # only for stream responses
- if len(self.chunk_times) > 1:
- self.first_chunk_latency = self.chunk_times[0] - self.start_time
- # remove the first chunk time from the total latency
- self.time_per_output_token = (self.query_latency - self.first_chunk_latency
- ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
- self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
- else:
- self.first_chunk_latency = self.query_latency
+ # response content
+ generated_text: str = ''
+ error: Optional[str] = None
+ prompt_tokens: Optional[int] = None
+ completion_tokens: Optional[int] = None

  def _calculate_tokens(self, api_plugin):
- self.prompt_tokens, self.completion_tokens = \
- api_plugin.parse_responses(self.response_messages, request=self.request)
+ if self.prompt_tokens is None or self.completion_tokens is None:
+ self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+ self.response_messages, request=self.request
+ )
+
+ # Calculate time per output token
+ if self.completion_tokens and self.completion_tokens > 1:
+ # tpot = (latency - ttft) / (output_len - 1)
+ self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+ # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+ if not self.inter_chunk_latency and self.chunk_times:
+ self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]

  def update_gpu_usage(self):
- total_memory = 0
- for i in range(torch.cuda.device_count()):
- total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
- self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+ if check_import('torch', raise_warning=False):
+ import torch
+ total_memory = 0
+ for i in range(torch.cuda.device_count()):
+ total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
+ self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


  class Metrics:
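
Note: time per output token now uses the conventional tpot = (latency - ttft) / (output_len - 1), excluding the first token, which is already covered by time-to-first-token. A worked example with made-up numbers:

    query_latency = 4.2        # seconds from request start to completion
    first_chunk_latency = 0.6  # time to first token (ttft)
    completion_tokens = 181

    time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
    print(f'{time_per_output_token * 1000:.1f} ms/token')  # (4.2 - 0.6) / 180 -> 20.0 ms/token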
@@ -77,6 +80,7 @@ class BenchmarkMetrics:
  n_total_prompt_tokens: int = 0
  n_total_completion_tokens: int = 0
  start_time: Optional[float] = None
+ last_completed_time: Optional[float] = None
  total_time: float = 1.0
  n_total_queries: int = 0
  n_time_per_output_token: float = 0.0
@@ -95,9 +99,6 @@ class BenchmarkMetrics:

  def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
  self.n_total_queries += 1
- if self.start_time is None:
- self.start_time = benchmark_data.start_time
- self.total_time = time.perf_counter() - self.start_time

  if benchmark_data.success:
  self.n_succeed_queries += 1
@@ -106,7 +107,6 @@ class BenchmarkMetrics:
  self.n_total_prompt_tokens += benchmark_data.prompt_tokens
  self.n_total_completion_tokens += benchmark_data.completion_tokens

- benchmark_data._calculate_query_stream_metric()
  self.total_latency += benchmark_data.query_latency
  self.total_first_chunk_latency += benchmark_data.first_chunk_latency
  self.n_time_per_output_token += benchmark_data.time_per_output_token
@@ -115,6 +115,22 @@ class BenchmarkMetrics:
  self.n_failed_queries += 1

  self.calculate_averages()
+ self.update_total_time(benchmark_data)
+
+ def update_total_time(self, benchmark_data: BenchmarkData):
+ # Use the earliest start_time seen so far
+ if self.start_time is None:
+ self.start_time = benchmark_data.start_time
+ else:
+ self.start_time = min(self.start_time, benchmark_data.start_time)
+ # Track the latest completion time
+ if self.last_completed_time is None:
+ self.last_completed_time = benchmark_data.completed_time
+ else:
+ self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+ # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+ if self.start_time is not None and self.last_completed_time is not None:
+ self.total_time = max(self.last_completed_time - self.start_time, 0.0)

  def calculate_averages(self):
  if self.n_succeed_queries == 0:
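
Note: total_time is now folded from request lifecycle timestamps (earliest start, latest completion) instead of sampling a timer in the metrics consumer. A minimal sketch of the same min/max folding with illustrative timestamps:

    requests = [
        # (start_time, completed_time), e.g. values from time.perf_counter()
        (100.0, 104.5),
        (100.2, 103.9),
        (101.1, 106.3),
    ]

    start_time = min(start for start, _ in requests)
    last_completed_time = max(done for _, done in requests)
    total_time = max(last_completed_time - start_time, 0.0)  # 106.3 - 100.0 = 6.3 s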

evalscope/perf/utils/db_util.py
@@ -19,7 +19,7 @@ logger = get_logger()
  class DatabaseColumns:
  REQUEST = 'request'
  START_TIME = 'start_time'
- CHUNK_TIMES = 'chunk_times'
+ INTER_TOKEN_LATENCIES = 'inter_token_latencies'
  SUCCESS = 'success'
  RESPONSE_MESSAGES = 'response_messages'
  COMPLETED_TIME = 'completed_time'
@@ -60,7 +60,7 @@ def create_result_table(cursor):
  f'''CREATE TABLE IF NOT EXISTS result(
  {DatabaseColumns.REQUEST} TEXT,
  {DatabaseColumns.START_TIME} REAL,
- {DatabaseColumns.CHUNK_TIMES} TEXT,
+ {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
  {DatabaseColumns.SUCCESS} INTEGER,
  {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
  {DatabaseColumns.COMPLETED_TIME} REAL,
@@ -75,15 +75,15 @@ def create_result_table(cursor):


  def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
- request = encode_data(benchmark_data.request)
- chunk_times = json.dumps(benchmark_data.chunk_times)
+ request = benchmark_data.request
+ inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
  response_messages = encode_data(benchmark_data.response_messages)

  # Columns common to both success and failure cases
  common_columns = (
  request,
  benchmark_data.start_time,
- chunk_times,
+ inter_token_latencies,
  benchmark_data.success,
  response_messages,
  benchmark_data.completed_time,
@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
  )
  query = f"""INSERT INTO result(
- {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+ {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
  {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
  {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  cursor.execute(query, common_columns + additional_columns)
  else:
  query = f"""INSERT INTO result(
- {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+ {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
  ) VALUES (?, ?, ?, ?, ?, ?)"""
  cursor.execute(query, common_columns)
@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  :param result_db_path: Path to the SQLite database file.
  :return: Dictionary of percentiles for various metrics.
  """
-
- def inter_token_latencies(chunk_times_json: str) -> List[float]:
- try:
- chunk_times = json.loads(chunk_times_json)
- return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
- except (json.JSONDecodeError, TypeError) as e:
- logger.error(f'Error parsing chunk times: {e}')
- return []
-
- query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+ query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
  {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
  {DatabaseColumns.PROMPT_TOKENS},
  {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
- FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+ FROM result WHERE {DatabaseColumns.SUCCESS}=1''' # noqa: E501

  percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  # Prepare data for each metric
  inter_token_latencies_all = []
  for row in rows:
- inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
+ try:
+ itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+ inter_token_latencies_all.extend(itl)
+ except (json.JSONDecodeError, TypeError) as e:
+ logger.error(f'Error parsing inter token latencies: {e}')

  metrics = {
  PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
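
Note: with inter-token latencies persisted as a JSON list per request, the percentile report only needs to flatten the lists and rank the values. A hedged sketch of that aggregation (numpy is used for the percentiles here, which may not match the actual implementation):

    import json
    import numpy as np

    rows = [
        '[0.021, 0.019, 0.025]',  # inter_token_latencies column, one JSON list per request
        '[0.018, 0.022]',
    ]

    itl_all = []
    for raw in rows:
        try:
            itl_all.extend(json.loads(raw) or [])
        except (json.JSONDecodeError, TypeError):
            continue

    percentiles = [50, 90, 99]
    print(dict(zip(percentiles, np.percentile(itl_all, percentiles).tolist())))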

evalscope/perf/utils/local_server.py
@@ -2,61 +2,18 @@ import os
  import subprocess
  import uvicorn
  from contextlib import asynccontextmanager
- from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
  from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()


- @dataclass
- class ServerSentEvent(object):
-
- def __init__(self, data='', event=None, id=None, retry=None):
- self.data = data
- self.event = event
- self.id = id
- self.retry = retry
-
- @classmethod
- def decode(cls, line):
- """Decode line to ServerSentEvent
-
-
- Args:
- line (str): The line.
-
- Return:
- ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
- """
- if not line:
- return None
- sse_msg = cls()
- # format data:xxx
- field_type, _, field_value = line.partition(':')
- if field_value.startswith(' '): # compatible with openai api
- field_value = field_value[1:]
- if field_type == 'event':
- sse_msg.event = field_value
- elif field_type == 'data':
- field_value = field_value.rstrip()
- sse_msg.data = field_value
- elif field_type == 'id':
- sse_msg.id = field_value
- elif field_type == 'retry':
- sse_msg.retry = field_value
- else:
- pass
-
- return sse_msg
-
-
  @asynccontextmanager
  async def lifespan(app: FastAPI):
  yield
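
Note: torch is no longer a hard dependency of the perf utilities; check_import now guards both the GPU-memory sampling in benchmark_util and the local-server path before torch is touched. A generic sketch of that optional-import pattern (not evalscope's helper, just the idea):

    import importlib.util

    def has_module(name: str) -> bool:
        # True if the module is importable, without importing it yet.
        return importlib.util.find_spec(name) is not None

    if has_module('torch'):
        import torch
        gpu_gib = sum(
            torch.cuda.max_memory_allocated(i) / 2**30 for i in range(torch.cuda.device_count())
        )
    else:
        gpu_gib = 0.0  # keep benchmarking usable without a GPU stack installed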
@@ -101,6 +58,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
  def start_app(args: Arguments):
  logger.info('Starting local server, please wait...')
  if args.api == 'local':
+ check_import('torch', 'torch', raise_error=True)
+
  app = create_app(args.model, args.attn_implementation)
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)


evalscope/perf/utils/log_utils.py
@@ -15,29 +15,42 @@ def init_wandb(args: Arguments) -> None:
  raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
  os.environ['WANDB_SILENT'] = 'true'
  os.environ['WANDB_DIR'] = args.outputs_dir
-
- wandb.login(key=args.wandb_api_key)
  current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
  name = args.name if args.name else f'{args.model_id}_{current_time}'
- wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+ # Remove sensitive information from logging config
+ logging_config = args.to_dict()
+ logging_config.pop('api_key', None)
+ logging_config.pop('wandb_api_key', None)
+
+ if args.wandb_api_key is not None:
+ wandb.login(key=args.wandb_api_key)
+ wandb.init(project='perf_benchmark', name=name, config=logging_config)


  def init_swanlab(args: Arguments) -> None:
+ """
+ Initialize SwanLab for logging.
+ """
  import datetime
  try:
  import swanlab
  except ImportError:
  raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
  os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
- if not args.swanlab_api_key == 'local':
- swanlab.login(api_key=args.swanlab_api_key)
  current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
  name = args.name if args.name else f'{args.model_id}_{current_time}'
  swanlab.config.update({'framework': '📏evalscope'})
+
+ # Remove sensitive information from logging config
+ logging_config = args.to_dict()
+ logging_config.pop('api_key', None)
+ logging_config.pop('swanlab_api_key', None)
+
  init_kwargs = {
  'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
  'name': name,
- 'config': args.to_dict(),
+ 'config': logging_config,
  'mode': 'local' if args.swanlab_api_key == 'local' else None
  }

@@ -45,4 +58,6 @@ def init_swanlab(args: Arguments) -> None:
  if workspace:
  init_kwargs['workspace'] = workspace

+ if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+ swanlab.login(api_key=args.swanlab_api_key)
  swanlab.init(**init_kwargs)
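
Note: both trackers now scrub credentials from the run config before uploading it; the scrub is just a pop on the serialized arguments. A minimal sketch with a made-up args dict:

    args_dict = {
        'model_id': 'qwen2.5-7b-instruct',  # illustrative values only
        'parallel': 8,
        'api_key': 'sk-secret',
        'wandb_api_key': 'wandb-secret',
    }

    logging_config = dict(args_dict)
    logging_config.pop('api_key', None)
    logging_config.pop('wandb_api_key', None)
    # logging_config now carries only non-sensitive fields for wandb/swanlab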