evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -61,17 +61,18 @@ def t5_tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_IN
 
 
 def load_pretrained_model(
-
-
-
-
-
-
-
-
-
-
-
+    model_cls,
+    model_args,
+    model_path=None,
+    tokenizer_path=None,
+    model_max_length=None,
+    padding_side=None,
+    image_aspect_ratio='pad',  # or 'square'
+    mmprojector_repo=None,
+    mmprojector_name=None,
+    device='cuda',
+    cache_dir=CACHE_DIR
+):
     tokenizer_dict = {}
     if model_max_length:
         tokenizer_dict['model_max_length'] = model_max_length
@@ -80,7 +81,7 @@ def load_pretrained_model(
 
     from ..utils import download_file
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_dict)
     # tokenizer.pad_token = tokenizer.unk_token  # could be redundant
 
     model_path = download_file(model_path, cache_dir=cache_dir)
@@ -106,7 +107,8 @@
         model_args.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter  # important to set to correct path
 
         model.get_model().initialize_vision_modules(
-            model_args
+            model_args
+        )  # This will load the CLIP vision encoder and MLP projector
     else:
         model.resize_token_embeddings(len(tokenizer))  # perhaps not needed
 
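The AutoTokenizer change above now explicitly forwards the collected tokenizer_dict options (such as model_max_length) to the tokenizer. A minimal sketch of the effect; the checkpoint name below is illustrative and not part of this diff:

from transformers import AutoTokenizer

# Sketch only: 'google/flan-t5-base' is a stand-in checkpoint, not one used by evalscope.
tokenizer_dict = {'model_max_length': 512}
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base', **tokenizer_dict)
print(tokenizer.model_max_length)  # 512, because the kwargs now reach the tokenizer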
evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py
@@ -8,8 +8,9 @@ from ..model import ScoreModel
 class VQAScoreModel(ScoreModel):
 
     @abstractmethod
-    def forward(
-
+    def forward(
+        self, images: List[str], texts: List[str], question_template: str, answer_template: str
+    ) -> torch.Tensor:
         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
         question_template: a string with optional {} to be replaced with the 'text'
         answer_template: a string with optional {} to be replaced with the 'text'
evalscope/models/__init__.py
CHANGED
@@ -4,38 +4,15 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .
-        CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-        TauBenchAdapter, initialize_model_adapter)
-    from .custom import CustomModel, DummyCustomModel
-    from .local_model import LocalModel, get_local_model
-    from .register import get_model_adapter
+    from .model_apis import llm_ckpt, mockllm, openai_api
 
 else:
     _import_structure = {
-        '
-        '
-        '
-        '
-
-            'MultiChoiceModelAdapter',
-            'CustomModelAdapter',
-            'ServerModelAdapter',
-            'T2IModelAdapter',
-            'TauBenchAdapter',
-            'BFCLAdapter',
-        ],
-        'custom': [
-            'CustomModel',
-            'DummyCustomModel',
-        ],
-        'local_model': [
-            'LocalModel',
-            'get_local_model',
-        ],
-        'register': [
-            'get_model_adapter',
-        ],
+        'model_apis': [
+            'openai_api',
+            'mockllm',
+            'llm_ckpt',
+        ]
     }
 
 import sys
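With the adapter exports removed, the lazy public surface of evalscope.models reduces to the three model-API factories. Assuming the _LazyModule wiring resolves names as declared above, a quick check looks like this:

from evalscope.models import llm_ckpt, mockllm, openai_api

# Each factory returns a ModelAPI subclass (see model_apis.py below).
print(openai_api())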
evalscope/models/image_edit_model.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
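generate() above expects the first message's content to be a (ContentText, ContentImage) pair: the edit instruction first, then the base64-encoded source image. A hedged sketch of building such an input; ChatMessageUser and the keyword constructors are assumptions drawn from the imports above, not confirmed by this diff:

from PIL import Image

from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText  # ChatMessageUser assumed
from evalscope.utils.io_utils import PIL_to_base64

source = Image.open('input.png')  # any local image
message = ChatMessageUser(content=[
    ContentText(text='Replace the sky with a sunset'),   # content[0]: edit prompt
    ContentImage(image=PIL_to_base64(source)),           # content[1]: image to edit
])
# ImageEditAPI.generate([message], tools=[], tool_choice='none', config=...) would then
# return a ModelOutput whose first choice carries the edited image as base64.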
evalscope/models/mockllm.py
@@ -0,0 +1,65 @@
+from typing import Any, Dict, Generator, Iterable, Iterator, List, Optional, Union
+
+from evalscope.api.dataset import Dataset
+from evalscope.api.messages import ChatMessage
+from evalscope.api.model import GenerateConfig, ModelAPI, ModelOutput
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.function_utils import thread_safe
+
+
+class MockLLM(ModelAPI):
+    """A mock implementation of the ModelAPI class for testing purposes.
+
+    Always returns default_output, unless you pass in a model_args
+    key "custom_outputs" with a value of an Iterable[ModelOutput]
+    """
+
+    default_output = 'Default output from mockllm/model'
+
+    outputs: Iterator[ModelOutput]
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        custom_outputs: Iterable[ModelOutput] = None,
+        **model_args: Dict[str, Any],
+    ) -> None:
+        super().__init__(model_name, base_url, api_key, config)
+        self.model_args = model_args
+        if custom_outputs is not None:
+            # We cannot rely on the user of this model giving custom_outputs
+            # the correct type since they do not call this constructor
+            # Hence this type check and the one in generate.
+            if not isinstance(custom_outputs, (Iterable, Generator)):
+                raise ValueError(
+                    f"model_args['custom_outputs'] must be an Iterable or a Generator, got {custom_outputs}"
+                )
+            self.outputs = iter(custom_outputs)
+        else:
+            self.outputs = iter((
+                ModelOutput.from_content(model='mockllm', content=self.default_output)
+                for _ in iter(int, 1)  # produce an infinite iterator
+            ))
+
+    @thread_safe
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+        try:
+            output = next(self.outputs)
+        except StopIteration:
+            raise ValueError('custom_outputs ran out of values')
+
+        if not isinstance(output, ModelOutput):
+            raise ValueError(f'output must be an instance of ModelOutput; got {type(output)}; content: {repr(output)}')
+        return output
+
+    def batch_generate(inputs: Dataset, config: GenerateConfig) -> List[ModelOutput]:
+        return super().batch_generate(inputs, config)
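Because MockLLM ignores its generate() arguments and only consumes the prepared iterator, it can be driven entirely from custom_outputs. A small usage sketch; the argument values passed to generate() are placeholders, since none of them are inspected:

from evalscope.api.model import GenerateConfig, ModelOutput
from evalscope.models.mockllm import MockLLM

mock = MockLLM(
    model_name='mockllm/model',
    custom_outputs=[
        ModelOutput.from_content(model='mockllm', content='first canned reply'),
        ModelOutput.from_content(model='mockllm', content='second canned reply'),
    ],
)

# Each call pops the next prepared output; a third call would raise
# "custom_outputs ran out of values".
print(mock.generate(input=[], tools=[], tool_choice='none', config=GenerateConfig()))
print(mock.generate(input=[], tools=[], tool_choice='none', config=GenerateConfig()))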
evalscope/models/model_apis.py
@@ -0,0 +1,67 @@
+from evalscope.api.model import ModelAPI
+from evalscope.api.registry import register_model_api
+from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
+
+
+@register_model_api(name='mock_llm')
+def mockllm() -> type[ModelAPI]:
+    from .mockllm import MockLLM
+
+    return MockLLM
+
+
+@register_model_api(name='openai_api')
+def openai_api() -> type[ModelAPI]:
+    from .openai_compatible import OpenAICompatibleAPI
+
+    return OpenAICompatibleAPI
+
+
+@register_model_api(name='server')
+@deprecated(since='1.0.0', remove_in='1.1.0', alternative='openai_api')
+def server() -> type[ModelAPI]:
+    from .openai_compatible import OpenAICompatibleAPI
+
+    return OpenAICompatibleAPI
+
+
+@register_model_api(name='llm_ckpt')
+def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
+    from .modelscope import ModelScopeAPI
+
+    return ModelScopeAPI
+
+
+@register_model_api(name='checkpoint')
+@deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
+def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True)
+
+    from .modelscope import ModelScopeAPI
+
+    return ModelScopeAPI
+
+
+@register_model_api(name='text2image')
+def text2image() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
+    from .text2image_model import Text2ImageAPI
+
+    return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import('torch', package='evalscope[aigc]', raise_error=True)
+    check_import('torchvision', package='evalscope[aigc]', raise_error=True)
+    check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
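model_apis.py is also the extension point for additional providers: registering a new backend follows the same factory-plus-decorator pattern shown above. A sketch with hypothetical names; my_api and MyCustomAPI are not part of the package:

from evalscope.api.model import ModelAPI
from evalscope.api.registry import register_model_api


@register_model_api(name='my_api')
def my_api() -> type[ModelAPI]:
    # import lazily so optional dependencies are only pulled in when the API is selected
    from my_plugin.model import MyCustomAPI  # hypothetical module

    return MyCustomAPI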