evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/models/image_edit_model.py (new file)
@@ -0,0 +1,125 @@
+ from __future__ import annotations
+
+ import importlib
+ import time
+ import torch
+ from logging import getLogger
+ from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+ from evalscope.api.messages import (
+     ChatMessage,
+     ChatMessageAssistant,
+     ContentAudio,
+     ContentImage,
+     ContentText,
+     ContentVideo,
+ )
+ from evalscope.api.model import (
+     ChatCompletionChoice,
+     GenerateConfig,
+     Logprob,
+     Logprobs,
+     ModelAPI,
+     ModelOutput,
+     ModelUsage,
+     TopLogprob,
+ )
+ from evalscope.api.tool import ToolChoice, ToolInfo
+ from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+ from evalscope.utils.model_utils import get_device
+
+ logger = getLogger()
+
+
+ class ImageEditAPI(ModelAPI):
+
+     def __init__(
+         self,
+         model_name: str,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         config: GenerateConfig = GenerateConfig(),
+         **model_args: Any,
+     ):
+         super().__init__(
+             model_name=model_name,
+             base_url=base_url,
+             api_key=api_key,
+             config=config,
+         )
+
+         # collect known model_args (then delete them so we can pass the rest on)
+         def collect_model_arg(name: str) -> Optional[Any]:
+             nonlocal model_args
+             value = model_args.get(name, None)
+             if value is not None:
+                 model_args.pop(name)
+             return value
+
+         model_path = collect_model_arg('model_path')
+         torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+         device_map = collect_model_arg('device_map')
+         # torch dtype
+         DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+             torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+         self.torch_dtype = torch_dtype
+         self.device = device_map or get_device()
+
+         self.pipeline_cls = collect_model_arg('pipeline_cls')
+         # default to DiffusionPipeline if not specified
+         if self.pipeline_cls is None:
+             if 'qwen' in model_name.lower():
+                 self.pipeline_cls = 'QwenImageEditPipeline'
+             else:
+                 logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                 raise ValueError('Invalid pipeline class.')
+
+         model_name_or_path = model_path or model_name
+
+         # from modelscope import pipeline_cls
+         module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+         logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+         self.model = module.from_pretrained(
+             model_name_or_path,
+             torch_dtype=self.torch_dtype,
+             **model_args,
+         )
+
+         self.model.to(self.device)
+
+     def generate(
+         self,
+         input: List[ChatMessage],
+         tools: List[ToolInfo],
+         tool_choice: ToolChoice,
+         config: GenerateConfig,
+     ) -> ModelOutput:
+
+         # prepare generator
+         kwargs: Dict[str, Any] = {}
+         if config.num_inference_steps is not None:
+             kwargs['num_inference_steps'] = config.num_inference_steps
+         kwargs.update(config.model_extra)
+
+         # assume the first text as prompt
+         content = input[0].content
+         assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+             'Invalid content types, expected (ContentText, ContentImage)'
+
+         prompt = content[0].text
+         input_image_base64 = content[1].image
+         input_image = base64_to_PIL(input_image_base64)
+         # get the first image as output
+         output = self.model(image=input_image, prompt=prompt, **kwargs)
+         image = output.images[0]
+
+         image_base64 = PIL_to_base64(image)
+
+         return ModelOutput(
+             model=self.model_name,
+             choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+             time=time.time(),
+         )
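
A minimal usage sketch of the new ImageEditAPI (not part of the diff): the checkpoint name and input file are placeholders, and the ChatMessageUser message type, the 'none' tool choice and the output access path are assumptions based on the surrounding evalscope.api interfaces rather than documented usage. Normal runs would go through the registered 'image_editing' model api instead of constructing the class directly.

    from PIL import Image

    from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText
    from evalscope.api.model import GenerateConfig
    from evalscope.models.image_edit_model import ImageEditAPI
    from evalscope.utils.io_utils import PIL_to_base64

    # 'qwen' in the model name makes __init__ pick QwenImageEditPipeline (see above).
    api = ImageEditAPI(model_name='Qwen/Qwen-Image-Edit', precision='bfloat16')  # placeholder checkpoint

    source_b64 = PIL_to_base64(Image.open('input.png'))  # placeholder input image
    message = ChatMessageUser(content=[
        ContentText(text='Replace the sky with a sunset'),  # prompt first...
        ContentImage(image=source_b64),                     # ...then the image, as generate() asserts
    ])

    output = api.generate(
        input=[message], tools=[], tool_choice='none',
        config=GenerateConfig(num_inference_steps=30),
    )
    edited_b64 = output.choices[0].message.content[0].image  # base64-encoded edited image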

evalscope/models/model_apis.py
@@ -1,6 +1,7 @@
  from evalscope.api.model import ModelAPI
  from evalscope.api.registry import register_model_api
  from evalscope.utils.deprecation_utils import deprecated
+ from evalscope.utils.import_utils import check_import


  @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:

  @register_model_api(name='llm_ckpt')
  def llm_ckpt() -> type[ModelAPI]:
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
      from .modelscope import ModelScopeAPI

      return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
  @register_model_api(name='checkpoint')
  @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
  def checkpoint() -> type[ModelAPI]:
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
      from .modelscope import ModelScopeAPI

      return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:

  @register_model_api(name='text2image')
  def text2image() -> type[ModelAPI]:
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='text2image')
+
      from .text2image_model import Text2ImageAPI

      return Text2ImageAPI
+
+
+ @register_model_api(name='image_editing')
+ def image_editing() -> type[ModelAPI]:
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='image_editing')
+
+     from .image_edit_model import ImageEditAPI
+
+     return ImageEditAPI
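
The pattern above is how evalscope keeps heavy optional dependencies lazy: the factory registered with @register_model_api only runs check_import and the real import when that model API is actually requested. A hedged sketch of registering a third-party backend the same way; every name below is hypothetical:

    from evalscope.api.model import ModelAPI
    from evalscope.api.registry import register_model_api
    from evalscope.utils.import_utils import check_import


    @register_model_api(name='my_diffusion_backend')
    def my_diffusion_backend() -> type[ModelAPI]:
        # Fail with an actionable message only when this backend is selected.
        check_import(['torch', 'diffusers'],
                     package='evalscope[aigc]',
                     raise_error=True,
                     feature_name='my_diffusion_backend')

        from my_package.my_model import MyDiffusionAPI  # hypothetical module

        return MyDiffusionAPI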

evalscope/models/openai_compatible.py
@@ -8,6 +8,7 @@ from evalscope.api.messages import ChatMessage
  from evalscope.api.model import ChatCompletionChoice, GenerateConfig, ModelAPI, ModelOutput
  from evalscope.api.tool import ToolChoice, ToolInfo
  from evalscope.utils import get_logger
+ from evalscope.utils.argument_utils import get_supported_params
  from .utils.openai import (
      chat_choices_from_openai,
      collect_stream_response,
@@ -48,6 +49,9 @@ class OpenAICompatibleAPI(ModelAPI):
          self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
          assert self.base_url, f'Base URL for {model_name} not found'

+         # remove trailing slash from base_url
+         self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
          # create http client
          self.client = OpenAI(
              api_key=self.api_key,
@@ -81,6 +85,8 @@
              **completion_params,
          )

+         self.validate_request_params(request)
+
          try:
              # generate completion and save response for model call
              completion = self.client.chat.completions.create(**request)
@@ -109,6 +115,21 @@
              tools=tools,
          )

+     def validate_request_params(self, params: Dict[str, Any]):
+         """Hook for subclasses to do custom request parameter validation."""
+         # Cache supported params to avoid repeated calls to inspect.signature.
+         if not hasattr(self, '_valid_params'):
+             self._valid_params = get_supported_params(self.client.chat.completions.create)
+
+         # Move unsupported parameters to extra_body.
+         extra_body = params.get('extra_body', {})
+         for key in list(params.keys()):
+             if key not in self._valid_params:
+                 extra_body[key] = params.pop(key)
+
+         if extra_body:
+             params['extra_body'] = extra_body
+
      def on_response(self, response: Dict[str, Any]) -> None:
          """Hook for subclasses to do custom response handling."""
          pass
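
The effect of validate_request_params is easiest to see on a concrete request dict: any key that is not a keyword of the installed SDK's chat.completions.create gets moved into extra_body instead of being passed (and rejected) as a top-level argument. A stand-alone re-enactment of that logic with a made-up supported-parameter set:

    # Hypothetical subset of what get_supported_params() might report for the SDK call.
    valid_params = {'model', 'messages', 'temperature', 'top_p', 'extra_body'}

    request = {
        'model': 'qwen-plus',
        'messages': [{'role': 'user', 'content': 'hi'}],
        'temperature': 0.7,
        'top_k': 20,                 # not a chat.completions.create keyword
        'repetition_penalty': 1.05,  # not a keyword either
    }

    extra_body = request.get('extra_body', {})
    for key in list(request.keys()):
        if key not in valid_params:
            extra_body[key] = request.pop(key)
    if extra_body:
        request['extra_body'] = extra_body

    # request == {'model': 'qwen-plus', 'messages': [...], 'temperature': 0.7,
    #             'extra_body': {'top_k': 20, 'repetition_penalty': 1.05}}

The top_k and repetition_penalty fields added to openai_completion_params further down are not OpenAI SDK keywords, so presumably they reach vLLM-style servers through this extra_body route as well.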

evalscope/models/text2image_model.py
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
              kwargs['num_inference_steps'] = config.num_inference_steps
          if config.guidance_scale is not None:
              kwargs['guidance_scale'] = config.guidance_scale
-         if config.extra_body is not None:
-             kwargs.update(config.extra_body)
+         # update with extra model parameters
+         kwargs.update(config.model_extra)

          # assume the first text as prompt
          prompt = input[0].text
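
The switch from config.extra_body to config.model_extra relies on GenerateConfig being a pydantic model that accepts undeclared fields: model_extra is pydantic's dict of extra fields, so any additional generation argument can ride along into the pipeline call. A small sketch; true_cfg_scale is a hypothetical extra argument, not a declared evalscope field:

    from evalscope.api.model import GenerateConfig

    config = GenerateConfig(num_inference_steps=30, true_cfg_scale=4.0)

    kwargs = {}
    if config.num_inference_steps is not None:
        kwargs['num_inference_steps'] = config.num_inference_steps
    kwargs.update(config.model_extra)  # picks up {'true_cfg_scale': 4.0}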

evalscope/models/utils/openai.py
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
          )
      elif content.type == 'audio':
          audio_data_uri = file_as_data_uri(content.audio)
-         audio_data = audio_data_uri.split('base64,')[1]

          return ChatCompletionContentPartInputAudioParam(
              type='input_audio', input_audio=dict(data=audio_data, format=content.format)
+             type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
          )

      else:
@@ -175,6 +174,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          params['stop'] = config.stop_seqs
      if config.presence_penalty is not None:
          params['presence_penalty'] = config.presence_penalty
+     if config.repetition_penalty is not None:
+         params['repetition_penalty'] = config.repetition_penalty
      if config.logit_bias is not None:
          params['logit_bias'] = config.logit_bias
      if config.seed is not None:
@@ -183,6 +184,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          params['temperature'] = config.temperature
      if config.top_p is not None:
          params['top_p'] = config.top_p
+     if config.top_k is not None:
+         params['top_k'] = config.top_k
      if config.n is not None:
          params['n'] = config.n
      if config.logprobs is not None:
@@ -205,11 +208,15 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          )
      if config.extra_body:
          params['extra_body'] = config.extra_body
+     if config.extra_query:
+         params['extra_query'] = config.extra_query
+     if config.extra_headers:
+         params['extra_headers'] = config.extra_headers

      return params


- def openai_assistant_content(message: ChatMessageAssistant) -> str:
+ def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
      # In agent bridge scenarios, we could encounter concepts such as reasoning and
      # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
      # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +227,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
      else:
          content = ''
          for c in message.content:
-             if c.type == 'reasoning':
+             if c.type == 'reasoning' and include_reasoning:
                  attribs = ''
                  if c.signature is not None:
                      attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +246,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
      return content


- def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+ def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
      oai_choices: List[Choice] = []

      for index, choice in enumerate(choices):
-         content = openai_assistant_content(choice.message)
+         # Handle content
+         content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+         # Handle tool calls
          if choice.message.tool_calls:
              tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
          else:

evalscope/perf/arguments.py
@@ -33,11 +33,17 @@ class Arguments(BaseArgument):
      rate: int = -1  # Rate limit for requests (default: -1, no limit)
      sleep_interval: int = 5  # Sleep interval between performance runs, in seconds

+     # Tuning knobs
+     db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+     queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+     in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
+
      # Logging and debugging
      log_every_n_query: int = 10  # Log every N queries
      debug: bool = False  # Debug mode
-     wandb_api_key: Optional[str] = None  # WandB API key for logging
-     swanlab_api_key: Optional[str] = None  # SwanLab API key for logging
+     visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+     wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+     swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
      name: Optional[str] = None  # Name for the run

      # Output settings
@@ -55,6 +61,7 @@ class Arguments(BaseArgument):
      image_height: int = 224  # Height of the image for random VL dataset
      image_format: str = 'RGB'  # Image format for random VL dataset
      image_num: int = 1  # Number of images for random VL dataset
+     image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation

      # Dataset settings
      dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -67,7 +74,7 @@ class Arguments(BaseArgument):
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
      n_choices: Optional[int] = None  # Number of response choices
-     seed: Optional[int] = 0  # Random seed for reproducibility
+     seed: Optional[int] = None  # Random seed for reproducibility
      stop: Optional[List[str]] = None  # Stop sequences for the response
      stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
      stream: Optional[bool] = True  # Whether to stream the response
@@ -106,6 +113,14 @@ class Arguments(BaseArgument):
              self.parallel
          ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501

+         # Validate tuning knobs
+         if self.db_commit_interval <= 0:
+             self.db_commit_interval = 1
+         if self.queue_size_multiplier <= 0:
+             self.queue_size_multiplier = 1
+         if self.in_flight_task_multiplier <= 0:
+             self.in_flight_task_multiplier = 1
+


  class ParseKVAction(argparse.Action):
@@ -151,9 +166,15 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument(
          '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501

+     # Tuning knobs
+     parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+     parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+     parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
+
      # Logging and debugging
      parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+     parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
      parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
      parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
      parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
@@ -171,6 +192,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
      parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
      parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+     parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501

      # Output settings
      parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -188,7 +210,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument(
          '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
      parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-     parser.add_argument('--seed', type=int, help='The random seed', default=0)
+     parser.add_argument('--seed', type=int, help='The random seed', default=None)
      parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
      parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
      parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
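
Taken together, the new Arguments fields give the perf runner explicit back-pressure controls. A hedged sketch of setting them programmatically; the model and url values are placeholders and every field not shown keeps its default:

    from evalscope.perf.arguments import Arguments

    args = Arguments(
        model='qwen2.5-7b-instruct',                      # placeholder
        url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder
        db_commit_interval=500,        # commit to SQLite every 500 processed rows
        queue_size_multiplier=5,       # result queue maxsize = parallel * 5
        in_flight_task_multiplier=2,   # at most parallel * 2 scheduled request tasks
        visualizer='swanlab',          # replaces the wandb/swanlab API-key switches, which are marked for deprecation
        seed=42,                       # seed now defaults to None, so set it explicitly for reproducibility
    )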

evalscope/perf/benchmark.py
@@ -3,8 +3,6 @@ import json
  import numpy as np
  import platform
  import sqlite3
- import time
- from http import HTTPStatus
  from tqdm import tqdm
  from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple

@@ -42,6 +40,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
      try:
          for messages in message_generator.build_messages():
              dataset_messages.append(messages)
+             if len(dataset_messages) >= args.number:
+                 break
      except StopIteration:
          pass

@@ -80,86 +80,58 @@ async def send_request(
      request: dict,
      benchmark_data_queue: asyncio.Queue,
      args: Arguments,
-     api_plugin: 'ApiPluginBase',
+     client: AioHttpClient,  # reuse shared client
  ):
      async with semaphore:
-         client = AioHttpClient(args, api_plugin)
-         async with client:
-             benchmark_data = BenchmarkData(request=request)
-             benchmark_data.start_time = time.perf_counter()
-             collected_messages = []
-             try:
-                 async for is_error, state_code, response_data in client.post(request):
-                     if is_error or state_code != HTTPStatus.OK:
-                         error_msg = str(response_data) if response_data else 'Unknown error'
-                         logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
-                         benchmark_data.success = False
-                         break
-                     if response_data:
-                         collected_messages.append(response_data)
-                         benchmark_data.chunk_times.append(time.perf_counter())
-                 benchmark_data.success = True
-                 benchmark_data.update_gpu_usage()
-             except Exception as e:
-                 if response_data:
-                     collected_messages.append(response_data)
-                 benchmark_data.success = False
-                 logger.exception(e)
-                 logger.error(f'Request query: {request} exception')
-             finally:
-                 benchmark_data.completed_time = time.perf_counter()
-                 benchmark_data.response_messages = collected_messages
-                 await benchmark_data_queue.put(benchmark_data)
+         benchmark_data = await client.post(request)
+         benchmark_data.update_gpu_usage()
+         await benchmark_data_queue.put(benchmark_data)


  @exception_handler
  async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
      metrics = BenchmarkMetrics(concurrency=args.parallel)
-
      result_db_path = get_result_db_path(args)

-     collected_benchmark_data = []
-
-     with tqdm(desc='Processing', total=args.number) as pbar:
-         while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-             try:
-                 # Attempt to get benchmark data from the queue with a timeout
-                 benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                 benchmark_data_queue.task_done()
-             except asyncio.TimeoutError:
-                 # If timeout, continue to the next iteration
-                 continue
-
-             # Update metrics based on the benchmark data
-             metrics.update_metrics(benchmark_data, api_plugin)
-
-             # Collect benchmark data for later database insertion
-             collected_benchmark_data.append(benchmark_data)
-
-             # Create a message with the updated metrics
-             message = metrics.create_message()
+     # Stream inserts to DB to avoid accumulating all results in memory
+     commit_every = args.db_commit_interval
+     processed_since_commit = 0

-             # Log the message to wandb\swanlab if the api key is provided
-             if args.wandb_api_key:
-                 import wandb
-                 wandb.log(message)
-             if args.swanlab_api_key:
-                 import swanlab
-                 swanlab.log(message)
-
-             # Log the message to the logger every n queries
-             if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                 msg = json.dumps(message, ensure_ascii=False, indent=2)
-                 logger.info(msg)
-
-             pbar.update(1)  # Update the progress bar
-
-     # Now perform database operations after all benchmark data has been processed
      with sqlite3.connect(result_db_path) as con:
          cursor = con.cursor()
          create_result_table(cursor)
-         for benchmark_data in collected_benchmark_data:
-             insert_benchmark_data(cursor, benchmark_data)
+
+         with tqdm(desc='Processing', total=args.number) as pbar:
+             while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+                 try:
+                     benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+                 except asyncio.TimeoutError:
+                     continue
+
+                 # Update metrics and write to DB immediately
+                 metrics.update_metrics(benchmark_data, api_plugin)
+                 insert_benchmark_data(cursor, benchmark_data)
+                 processed_since_commit += 1
+                 if processed_since_commit >= commit_every:
+                     con.commit()
+                     processed_since_commit = 0
+
+                 message = metrics.create_message()
+
+                 if args.wandb_api_key:
+                     import wandb
+                     wandb.log(message)
+                 if args.swanlab_api_key:
+                     import swanlab
+                     swanlab.log(message)
+
+                 if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                     msg = json.dumps(message, ensure_ascii=False, indent=2)
+                     logger.info(msg)
+
+                 benchmark_data_queue.task_done()
+                 pbar.update(1)
+
          con.commit()

      return metrics, result_db_path
@@ -177,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
      loop = asyncio.get_running_loop()
      add_signal_handlers(loop)

-     # Create API plugin instance for request/response processing
      api_plugin_class = ApiRegistry.get_class(args.api)
      api_plugin = api_plugin_class(args)

-     # init queue
-     benchmark_data_queue = asyncio.Queue()
-     # reset event
+     benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
      data_process_completed_event.clear()
+
      # test connection
      await connect_test(args, api_plugin)
-     # start statistic benchmark metric
-     statistic_benchmark_metric_task = asyncio.create_task(
-         statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
-     )
-     # start send request
-     semaphore = asyncio.Semaphore(args.parallel)
-     send_request_tasks: List[asyncio.Task] = []
-     async for request in get_requests(args, api_plugin):
-         task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
-         send_request_tasks.append(task)
-
-     await asyncio.gather(*send_request_tasks, return_exceptions=True)
-     await benchmark_data_queue.join()
-     data_process_completed_event.set()
-
-     metrics, result_db_path = await statistic_benchmark_metric_task
+
+     # Create a single shared client session for all requests
+     client = AioHttpClient(args, api_plugin)
+     async with client:
+         # start statistic benchmark metric (consumer)
+         statistic_benchmark_metric_task = asyncio.create_task(
+             statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+         )
+
+         # start sending requests with bounded in-flight tasks
+         semaphore = asyncio.Semaphore(args.parallel)
+         in_flight: set[asyncio.Task] = set()
+         max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+         async for request in get_requests(args, api_plugin):
+             # Keep the number of scheduled tasks bounded to avoid OOM
+             if len(in_flight) >= max_in_flight:
+                 done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                 in_flight = pending
+
+             task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+             in_flight.add(task)
+
+         # Wait for remaining in-flight tasks
+         if in_flight:
+             await asyncio.gather(*in_flight, return_exceptions=True)
+
+         # Drain queue and finish
+         await benchmark_data_queue.join()
+         data_process_completed_event.set()
+
+         metrics, result_db_path = await statistic_benchmark_metric_task
+
      metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
      return metrics_result, percentile_result
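
The reworked benchmark() loop above is essentially the classic bounded in-flight producer pattern: never keep more than parallel * in_flight_task_multiplier scheduled tasks alive, so an arbitrarily long request stream cannot exhaust memory. A self-contained sketch of the same pattern outside evalscope:

    import asyncio


    async def worker(i: int) -> None:
        await asyncio.sleep(0.01)  # stand-in for one HTTP request


    async def main(total: int = 100, parallel: int = 8, multiplier: int = 2) -> None:
        max_in_flight = parallel * multiplier
        in_flight: set[asyncio.Task] = set()

        for i in range(total):  # stand-in for `async for request in get_requests(...)`
            if len(in_flight) >= max_in_flight:
                # Block until at least one task finishes; keep only the pending ones
                _done, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
            in_flight.add(asyncio.create_task(worker(i)))

        # Wait for whatever is still running at the end of the stream
        if in_flight:
            await asyncio.gather(*in_flight, return_exceptions=True)


    asyncio.run(main())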