evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the content changes between two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/models/image_edit_model.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
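For orientation, here is a minimal usage sketch for the new adapter. It is hedged: `ChatMessageUser` (only `ChatMessage` and `ChatMessageAssistant` appear in the imports above) and the exact `ContentText`/`ContentImage` constructors are assumptions inferred from how `generate()` reads them, and the model id and checkpoint path are placeholders.

```python
# Hedged sketch: drive ImageEditAPI.generate() directly. ChatMessageUser and the
# ContentText/ContentImage keyword constructors are assumptions; the model id and
# checkpoint path are placeholders.
from PIL import Image

from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText  # ChatMessageUser assumed
from evalscope.api.model import GenerateConfig
from evalscope.models.image_edit_model import ImageEditAPI
from evalscope.utils.io_utils import PIL_to_base64

source_b64 = PIL_to_base64(Image.open('input.png'))  # the adapter passes images around as base64

api = ImageEditAPI(
    model_name='Qwen/Qwen-Image-Edit',  # 'qwen' in the name selects QwenImageEditPipeline by default
    model_path='/path/to/checkpoint',   # optional; falls back to model_name
    precision='bfloat16',
)

config = GenerateConfig(num_inference_steps=30)
message = ChatMessageUser(content=[
    ContentText(text='Replace the sky with a sunset'),
    ContentImage(image=source_b64),
])

# tools / tool_choice are never read by this adapter's generate(), so placeholders suffice.
output = api.generate(input=[message], tools=[], tool_choice=None, config=config)
edited = output.choices[0]  # the edited image comes back as a base64 ContentImage in the first choice
```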
evalscope/models/model_apis.py
CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import


 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:

 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI

     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI

     return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:

 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
+
     from .text2image_model import Text2ImageAPI

     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
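The model_apis.py changes follow a guard-then-lazy-import pattern: each registered factory calls check_import for the optional dependencies before importing the heavy backend module. Below is a self-contained sketch of the same idea; `require` and the simplified registry are stand-ins, not evalscope's actual helpers (check_import's real behavior is only known here from its call sites).

```python
# Self-contained sketch of the guard-then-lazy-import pattern; `require` and the
# simplified registry below are stand-ins, not evalscope's actual helpers.
import importlib.util
from typing import Callable, Dict, List

_MODEL_APIS: Dict[str, Callable[[], type]] = {}


def register_model_api(name: str):
    def wrap(factory: Callable[[], type]) -> Callable[[], type]:
        _MODEL_APIS[name] = factory
        return factory
    return wrap


def require(modules: List[str], package: str, feature_name: str) -> None:
    """Fail fast with an actionable message if an optional dependency is missing."""
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    if missing:
        raise ImportError(f"{feature_name} requires {missing}; install with: pip install '{package}'")


@register_model_api(name='image_editing')
def image_editing() -> type:
    # Check the optional 'aigc' extras before paying the cost of importing them.
    require(['torch', 'torchvision', 'diffusers'], package='evalscope[aigc]', feature_name='image_editing')
    from evalscope.models.image_edit_model import ImageEditAPI  # heavy import deferred until first use
    return ImageEditAPI
```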
evalscope/models/openai_compatible.py
CHANGED
@@ -8,6 +8,7 @@ from evalscope.api.messages import ChatMessage
 from evalscope.api.model import ChatCompletionChoice, GenerateConfig, ModelAPI, ModelOutput
 from evalscope.api.tool import ToolChoice, ToolInfo
 from evalscope.utils import get_logger
+from evalscope.utils.argument_utils import get_supported_params
 from .utils.openai import (
     chat_choices_from_openai,
     collect_stream_response,
@@ -48,6 +49,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'

+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
@@ -81,6 +85,8 @@ class OpenAICompatibleAPI(ModelAPI):
             **completion_params,
         )

+        self.validate_request_params(request)
+
         try:
             # generate completion and save response for model call
             completion = self.client.chat.completions.create(**request)
@@ -109,6 +115,21 @@ class OpenAICompatibleAPI(ModelAPI):
             tools=tools,
         )

+    def validate_request_params(self, params: Dict[str, Any]):
+        """Hook for subclasses to do custom request parameter validation."""
+        # Cache supported params to avoid repeated calls to inspect.signature.
+        if not hasattr(self, '_valid_params'):
+            self._valid_params = get_supported_params(self.client.chat.completions.create)
+
+        # Move unsupported parameters to extra_body.
+        extra_body = params.get('extra_body', {})
+        for key in list(params.keys()):
+            if key not in self._valid_params:
+                extra_body[key] = params.pop(key)
+
+        if extra_body:
+            params['extra_body'] = extra_body
+
     def on_response(self, response: Dict[str, Any]) -> None:
         """Hook for subclasses to do custom response handling."""
         pass
evalscope/models/text2image_model.py
CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-
-
+        # update with extra model parameters
+        kwargs.update(config.model_extra)

         # assume the first text as prompt
         prompt = input[0].text
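The new `validate_request_params` hook keeps non-standard sampling parameters from breaking the OpenAI client by moving anything the `create` method does not accept by name into `extra_body`. A standalone illustration of that routing is below; `fake_create` stands in for `client.chat.completions.create`, and `inspect.signature` approximates `get_supported_params`.

```python
# Standalone illustration of the extra_body routing; fake_create stands in for
# client.chat.completions.create and inspect.signature approximates get_supported_params.
import inspect
from typing import Any, Dict, Optional


def fake_create(*, model: str, messages: list, temperature: float = 1.0, extra_body: Optional[dict] = None):
    return {'model': model, 'messages': messages, 'temperature': temperature, 'extra_body': extra_body}


def route_unsupported(params: Dict[str, Any], fn) -> Dict[str, Any]:
    valid = set(inspect.signature(fn).parameters)  # names the client accepts directly
    extra_body = params.get('extra_body') or {}
    for key in list(params.keys()):
        if key not in valid:
            extra_body[key] = params.pop(key)      # everything else rides along in extra_body
    if extra_body:
        params['extra_body'] = extra_body
    return params


request = {'model': 'my-model', 'messages': [], 'temperature': 0.2, 'top_k': 20, 'repetition_penalty': 1.05}
print(route_unsupported(request, fake_create))
# -> top_k and repetition_penalty end up under 'extra_body'; the rest stays top-level.
```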
evalscope/models/utils/openai.py
CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]

         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )

     else:
@@ -175,6 +174,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
         params['stop'] = config.stop_seqs
     if config.presence_penalty is not None:
         params['presence_penalty'] = config.presence_penalty
+    if config.repetition_penalty is not None:
+        params['repetition_penalty'] = config.repetition_penalty
     if config.logit_bias is not None:
         params['logit_bias'] = config.logit_bias
     if config.seed is not None:
@@ -183,6 +184,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
         params['temperature'] = config.temperature
     if config.top_p is not None:
         params['top_p'] = config.top_p
+    if config.top_k is not None:
+        params['top_k'] = config.top_k
     if config.n is not None:
         params['n'] = config.n
     if config.logprobs is not None:
@@ -205,11 +208,15 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     )
     if config.extra_body:
         params['extra_body'] = config.extra_body
+    if config.extra_query:
+        params['extra_query'] = config.extra_query
+    if config.extra_headers:
+        params['extra_headers'] = config.extra_headers

     return params


-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +227,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +246,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content


-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []

     for index, choice in enumerate(choices):
-
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
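The utils/openai.py changes forward a few more GenerateConfig fields (repetition_penalty, top_k, extra_query, extra_headers) into the request parameters. A short sketch of setting them follows; the field names come from the call sites above, their availability as constructor keywords is assumed, and whether a backend honors top_k/repetition_penalty natively or only via extra_body depends on the server.

```python
# Sketch: GenerateConfig fields newly forwarded by openai_completion_params.
# Field names are taken from the call sites above; constructor keywords are assumed.
from evalscope.api.model import GenerateConfig
from evalscope.models.utils.openai import openai_completion_params

config = GenerateConfig(
    temperature=0.7,
    top_p=0.9,
    top_k=20,                 # newly forwarded
    repetition_penalty=1.05,  # newly forwarded
    extra_headers={'X-Request-Source': 'evalscope'},  # newly forwarded
)
params = openai_completion_params(model='my-model', config=config, tools=False)
# params now carries 'top_k', 'repetition_penalty' and 'extra_headers' entries.
```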
evalscope/perf/arguments.py
CHANGED
@@ -33,11 +33,17 @@ class Arguments(BaseArgument):
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
     sleep_interval: int = 5  # Sleep interval between performance runs, in seconds

+    # Tuning knobs
+    db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+    queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+    in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
+
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
     debug: bool = False  # Debug mode
-
-
+    visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+    wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+    swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
     name: Optional[str] = None  # Name for the run

     # Output settings
@@ -55,6 +61,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation

     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -67,7 +74,7 @@ class Arguments(BaseArgument):
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = None  # Random seed for reproducibility
     stop: Optional[List[str]] = None  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
@@ -106,6 +113,14 @@ class Arguments(BaseArgument):
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501

+        # Validate tuning knobs
+        if self.db_commit_interval <= 0:
+            self.db_commit_interval = 1
+        if self.queue_size_multiplier <= 0:
+            self.queue_size_multiplier = 1
+        if self.in_flight_task_multiplier <= 0:
+            self.in_flight_task_multiplier = 1
+

 class ParseKVAction(argparse.Action):

@@ -151,9 +166,15 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501

+    # Tuning knobs
+    parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+    parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+    parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
+
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+    parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
     parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
     parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
@@ -171,6 +192,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501

     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -188,7 +210,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=None)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
    parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
     parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
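The new perf tuning knobs can be set through the CLI flags added above or programmatically on Arguments. A sketch of the latter follows; only the three tuning fields are certain from this diff, while the remaining fields (model, url, parallel, number) are the usual perf settings shown here as plausible placeholders whose exact types (scalar vs. list) are not visible in this hunk.

```python
# Illustrative only: the three tuning knobs come from the Arguments fields above;
# model/url/parallel/number are the usual perf settings and are placeholders here.
from evalscope.perf.arguments import Arguments

args = Arguments(
    model='my-model',
    url='http://127.0.0.1:8801/v1/chat/completions',
    parallel=16,
    number=200,
    db_commit_interval=500,       # commit buffered rows to SQLite every 500 results
    queue_size_multiplier=4,      # benchmark queue maxsize = parallel * 4
    in_flight_task_multiplier=2,  # at most parallel * 2 scheduled request tasks
)
# Validation clamps non-positive values for these knobs back to 1.
```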
evalscope/perf/benchmark.py
CHANGED
@@ -3,8 +3,6 @@ import json
 import numpy as np
 import platform
 import sqlite3
-import time
-from http import HTTPStatus
 from tqdm import tqdm
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple

@@ -42,6 +40,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass

@@ -80,86 +80,58 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
-
+    client: AioHttpClient,  # reuse shared client
 ):
     async with semaphore:
-
-
-
-        benchmark_data.start_time = time.perf_counter()
-        collected_messages = []
-        try:
-            async for is_error, state_code, response_data in client.post(request):
-                if is_error or state_code != HTTPStatus.OK:
-                    error_msg = str(response_data) if response_data else 'Unknown error'
-                    logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
-                    benchmark_data.success = False
-                    break
-                if response_data:
-                    collected_messages.append(response_data)
-                    benchmark_data.chunk_times.append(time.perf_counter())
-            benchmark_data.success = True
-            benchmark_data.update_gpu_usage()
-        except Exception as e:
-            if response_data:
-                collected_messages.append(response_data)
-            benchmark_data.success = False
-            logger.exception(e)
-            logger.error(f'Request query: {request} exception')
-        finally:
-            benchmark_data.completed_time = time.perf_counter()
-            benchmark_data.response_messages = collected_messages
-            await benchmark_data_queue.put(benchmark_data)
+        benchmark_data = await client.post(request)
+        benchmark_data.update_gpu_usage()
+        await benchmark_data_queue.put(benchmark_data)


 @exception_handler
 async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
-
     result_db_path = get_result_db_path(args)

-
-
-
-    while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-        try:
-            # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-            benchmark_data_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Update metrics based on the benchmark data
-        metrics.update_metrics(benchmark_data, api_plugin)
-
-        # Collect benchmark data for later database insertion
-        collected_benchmark_data.append(benchmark_data)
-
-        # Create a message with the updated metrics
-        message = metrics.create_message()
+    # Stream inserts to DB to avoid accumulating all results in memory
+    commit_every = args.db_commit_interval
+    processed_since_commit = 0

-        # Log the message to wandb\swanlab if the api key is provided
-        if args.wandb_api_key:
-            import wandb
-            wandb.log(message)
-        if args.swanlab_api_key:
-            import swanlab
-            swanlab.log(message)
-
-        # Log the message to the logger every n queries
-        if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-            msg = json.dumps(message, ensure_ascii=False, indent=2)
-            logger.info(msg)
-
-        pbar.update(1)  # Update the progress bar
-
-    # Now perform database operations after all benchmark data has been processed
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
         create_result_table(cursor)
-
-
+
+        with tqdm(desc='Processing', total=args.number) as pbar:
+            while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+                try:
+                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+                except asyncio.TimeoutError:
+                    continue
+
+                # Update metrics and write to DB immediately
+                metrics.update_metrics(benchmark_data, api_plugin)
+                insert_benchmark_data(cursor, benchmark_data)
+                processed_since_commit += 1
+                if processed_since_commit >= commit_every:
+                    con.commit()
+                    processed_since_commit = 0
+
+                message = metrics.create_message()
+
+                if args.wandb_api_key:
+                    import wandb
+                    wandb.log(message)
+                if args.swanlab_api_key:
+                    import swanlab
+                    swanlab.log(message)
+
+                if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                    msg = json.dumps(message, ensure_ascii=False, indent=2)
+                    logger.info(msg)
+
+                benchmark_data_queue.task_done()
+                pbar.update(1)
+
         con.commit()

     return metrics, result_db_path
@@ -177,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)

-    # Create API plugin instance for request/response processing
     api_plugin_class = ApiRegistry.get_class(args.api)
     api_plugin = api_plugin_class(args)

-
-    benchmark_data_queue = asyncio.Queue()
-    # reset event
+    benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
     data_process_completed_event.clear()
+
     # test connection
     await connect_test(args, api_plugin)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Create a single shared client session for all requests
+    client = AioHttpClient(args, api_plugin)
+    async with client:
+        # start statistic benchmark metric (consumer)
+        statistic_benchmark_metric_task = asyncio.create_task(
+            statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+        )
+
+        # start sending requests with bounded in-flight tasks
+        semaphore = asyncio.Semaphore(args.parallel)
+        in_flight: set[asyncio.Task] = set()
+        max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+        async for request in get_requests(args, api_plugin):
+            # Keep the number of scheduled tasks bounded to avoid OOM
+            if len(in_flight) >= max_in_flight:
+                done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                in_flight = pending
+
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+            in_flight.add(task)
+
+        # Wait for remaining in-flight tasks
+        if in_flight:
+            await asyncio.gather(*in_flight, return_exceptions=True)
+
+        # Drain queue and finish
+        await benchmark_data_queue.join()
+        data_process_completed_event.set()
+
+        metrics, result_db_path = await statistic_benchmark_metric_task
+
     metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
     return metrics_result, percentile_result