evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the content changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/mixin/dataset_mixin.py
DELETED
@@ -1,105 +0,0 @@
-from abc import ABC
-from collections import defaultdict
-from typing import Any, Callable, Dict
-
-from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
-
-
-class DatasetLoaderMixin:
-    """
-    Mixin class providing dataset loading functionality for benchmarks.
-
-    This mixin provides common dataset loading methods that can be shared
-    across different data adapters, including support for:
-    - Loading multiple subsets
-    - Few-shot dataset loading
-    - Remote dataset loading with configuration
-    """
-
-    def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
-        """
-        Load multiple subsets of the dataset using the provided loading function.
-
-        This method handles two loading strategies:
-        1. Reformat mode: Load only the default subset and reformat it
-        2. Multi-subset mode: Load all subsets specified in subset_list
-
-        Args:
-            load_func (Callable[[str], Dataset]): Function to load individual subsets
-
-        Returns:
-            DatasetDict: Dictionary containing all loaded subsets
-        """
-        if self.reformat_subset:
-            # Load only the default subset
-            subset_data = load_func(self.default_subset)
-            # Reformat the subset to create multiple subsets based on sample keys
-            # NOTE: subset_list and limit is applied here if specified
-            dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
-        else:
-            # Load all specified subsets into separate entries
-            subset_dict = defaultdict()
-            for subset in self.subset_list:
-                subset_data = load_func(subset)
-                subset_dict[subset] = subset_data
-            dataset_dict = DatasetDict(subset_dict)
-        return dataset_dict
-
-    def load_subset(self, subset: str) -> Dataset:
-        """
-        Load a specific subset of the dataset for evaluation.
-
-        This method configures and executes the data loading for a single subset,
-        handling both split-as-subset and traditional subset configurations.
-
-        Args:
-            subset (str): The subset identifier to load
-
-        Returns:
-            Dataset: The loaded dataset subset with processed samples
-        """
-        # Determine the split and subset names based on configuration
-        split = subset if self.split_as_subset else self.eval_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create and configure the remote data loader
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,  # Custom sample conversion function
-            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
-            repeats=self._task_config.repeats,  # Number of repetitions for each sample
-            data_source=self._task_config.dataset_hub,  # Data source configuration
-        )
-        return loader.load()
-
-    def load_fewshot_subset(self, subset: str) -> Dataset:
-        """
-        Load a subset specifically for few-shot examples.
-
-        This method loads training data to be used as demonstrations in few-shot prompting.
-        It typically loads from the training split with limited samples and optional shuffling.
-
-        Args:
-            subset (str): The subset identifier to load few-shot examples from
-
-        Returns:
-            Dataset: The loaded few-shot dataset with demonstration examples
-        """
-        # Use training split for few-shot examples
-        split = subset if self.split_as_subset else self.train_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create loader specifically configured for few-shot sampling
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,
-            limit=self.few_shot_num
-            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
-            shuffle=self.few_shot_random,  # Randomize selection if enabled
-            data_source=self._task_config.dataset_hub,
-        )
-        return loader.load()
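Note: `evalscope/api/mixin/dataset_mixin.py` is removed in 1.2.0, while the mixin package gains `sandbox_mixin.py` and keeps `llm_judge_mixin.py` (see the file list above). For context, the sketch below shows roughly how a 1.0.0-era data adapter could have driven the removed mixin. It is a minimal sketch only: the adapter class name, dataset id, and attribute values are hypothetical and not taken from the package.

```python
# Minimal sketch, assuming evalscope 1.0.0 where dataset_mixin.py still exists.
# MyAdapter, the dataset id and all attribute values below are hypothetical.
from types import SimpleNamespace

from evalscope.api.mixin.dataset_mixin import DatasetLoaderMixin


class MyAdapter(DatasetLoaderMixin):
    dataset_id = 'my-org/my-dataset'   # hypothetical dataset id on the hub
    default_subset = 'default'
    subset_list = ['easy', 'hard']     # one Dataset is loaded per entry
    split_as_subset = False            # entries are real subsets, not splits
    reformat_subset = False            # multi-subset mode, not reformat mode
    eval_split = 'test'
    train_split = 'train'
    limit = None
    few_shot_num = 5
    few_shot_random = True
    # stand-in for the task config the real adapters receive
    _task_config = SimpleNamespace(repeats=1, dataset_hub='modelscope')

    def record_to_sample(self, record: dict):
        ...  # convert one raw record into an evalscope Sample

    def load(self):
        # load_subsets calls load_subset once per entry in subset_list
        return self.load_subsets(self.load_subset)
```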
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py
DELETED
@@ -1,44 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class GeneralI2IAdapter:
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict
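Note: the removed `general_i2i` adapter resolved its data purely from local JSONL files, either one `<subset>.jsonl` per entry in `subset_list` inside a directory, or a single `.jsonl` file whose stem becomes the subset name. A hedged sketch of that call pattern follows; the paths are illustrative and evalscope 1.0.0 is assumed.

```python
# Illustrative only; the data paths are hypothetical, evalscope 1.0.0 assumed.
from evalscope.benchmarks.aigc.i2i.general_i2i_adapter import GeneralI2IAdapter

adapter = GeneralI2IAdapter()

# Directory mode: loads data/i2i/edit.jsonl and data/i2i/inpaint.jsonl
data = adapter.load(dataset_name_or_path='data/i2i', subset_list=['edit', 'inpaint'])
# -> {'edit': {'test': [...]}, 'inpaint': {'test': [...]}}

# Single-file mode: the file stem ('example') becomes the only subset
data = adapter.load(dataset_name_or_path='data/example.jsonl')
# -> {'example': {'test': [...]}}
```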
tests/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
tests/aigc/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
tests/aigc/test_t2i.py
DELETED
@@ -1,142 +0,0 @@
-from dotenv import dotenv_values
-
-env = dotenv_values('.env')
-
-import os
-import unittest
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalType, JudgeStrategy, ModelTask, OutputType
-from evalscope.run import run_task
-from evalscope.utils.logger import get_logger
-from tests.utils import test_level_list
-
-os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
-logger = get_logger()
-
-
-class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_general(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            datasets=[
-                'general_t2i'
-            ],
-            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
-            dataset_args={
-                'general_t2i': {
-                    'metric_list': [
-                        'PickScore',
-                        # 'CLIPScore',
-                        # 'HPSv2Score',
-                        # 'HPSv2.1Score',
-                        # 'BLIPv2Score',
-                        # 'ImageRewardScore',
-                        # 'VQAScore',
-                        # 'FGA_BLIP2Score',
-                        # 'MPS'
-                    ],
-                    'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
-                }
-            }
-        )
-
-        run_task(task_cfg=task_cfg)
-
-    def test_run_local_evalmuse(self):
-        from evalscope import TaskConfig, run_task
-
-        task_cfg = TaskConfig(
-            model_id='T2I-Model',  # for display only; no real model ID is needed at runtime
-            model_task=ModelTask.IMAGE_GENERATION,
-            datasets=[
-                'evalmuse',  # use the EvalMuse benchmark
-            ],
-            dataset_args={
-                'evalmuse': {
-                    'dataset_id': 'data/example.jsonl',  # path to the constructed jsonl file
-                }
-            },
-        )
-
-        run_task(task_cfg=task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_benchmark(self):
-
-        task_cfg = TaskConfig(
-            model='stabilityai/stable-diffusion-xl-base-1.0',  # model on modelscope
-            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
-            model_args={
-                'use_safetensors': True,
-                'variant': 'fp16',
-                'torch_dtype': 'torch.float16',
-            },
-            datasets=[
-                # 'tifa160',
-                # 'genai_bench',
-                'evalmuse',
-                # 'hpdv2',
-            ],
-            dataset_args={
-                'tifa160': {
-                    'metric_list': [
-                        # 'PickScore',
-                        # 'CLIPScore',
-                        # 'HPSv2Score',
-                        # 'BLIPv2Score',
-                        # 'ImageRewardScore',
-                        # 'VQAScore',
-                        'FGA_BLIP2Score',
-                    ]
-                }
-            },
-            limit=5,
-            generation_config={
-                'num_inference_steps': 50,
-                'guidance_scale': 7.5
-            },
-            # use_cache='outputs/20250427_134122',
-        )
-
-        run_task(task_cfg=task_cfg)
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_benchmark_flux(self):
-
-        task_cfg = TaskConfig(
-            model='black-forest-labs/FLUX.1-dev',  # model on modelscope
-            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
-            model_args={
-                'torch_dtype': 'torch.float16',
-            },
-            datasets=[
-                # 'tifa160',
-                # 'genai_bench',
-                'evalmuse',
-                # 'hpdv2',
-            ],
-            dataset_args={
-                'tifa160': {
-                    'metric_list': [
-                        'PickScore',
-                        # 'CLIPScore',
-                        # 'HPSv2Score',
-                        # 'BLIPv2Score',
-                        # 'ImageRewardScore',
-                        # 'VQAScore',
-                        # 'FGA_BLIP2Score',
-                    ]
-                }
-            },
-            generation_config={
-                'num_inference_steps': 50,
-                'guidance_scale': 3.5
-            },
-            use_cache='outputs/20250520_112314'
-        )
-
-        run_task(task_cfg=task_cfg)
tests/benchmark/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.