evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/benchmark/meta.py (new file)
@@ -0,0 +1,121 @@
+import copy
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
+
+from evalscope.constants import OutputType
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+
+
+@dataclass
+class BenchmarkMeta:
+    """Metadata for a benchmark, including dataset and model configurations."""
+
+    name: str
+    """ Unique name of the benchmark."""
+
+    dataset_id: str
+    """ Dataset id on modelscope or path to local dataset."""
+
+    data_adapter: Optional[Type['DataAdapter']] = None
+    """ Data adapter class for the benchmark."""
+
+    output_types: List[str] = field(default_factory=lambda: [OutputType.GENERATION])
+    """ List of output types supported by the benchmark."""
+
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
+    """ List of subsets available for the benchmark."""
+
+    default_subset: str = 'default'
+    """ Default subset to use for the benchmark."""
+
+    few_shot_num: int = 0
+    """ Number of few-shot examples to use."""
+
+    few_shot_random: bool = False
+    """ Whether to use random few-shot examples."""
+
+    train_split: Optional[str] = None
+    """ Training split to use for the benchmark."""
+
+    eval_split: Optional[str] = None
+    """ Evaluation split to use for the benchmark."""
+
+    prompt_template: Optional[str] = None
+    """ Prompt template to use for the benchmark."""
+
+    few_shot_prompt_template: Optional[str] = None
+    """ Few-shot prompt template to use for the benchmark."""
+
+    system_prompt: Optional[str] = None
+    """ System prompt to use for the benchmark."""
+
+    query_template: Optional[str] = None
+    """ Query template to use for the benchmark."""
+
+    pretty_name: Optional[str] = None
+    """ Human-readable name for the benchmark."""
+
+    description: Optional[str] = None
+    """ Description of the benchmark."""
+
+    tags: List[str] = field(default_factory=list)
+    """ Tags associated with the benchmark."""
+
+    filters: Optional[OrderedDict] = None
+    """ Filters to apply to the dataset on model output."""
+
+    metric_list: List[Union[str, Dict[str, Any]]] = field(default_factory=list)
+    """ List of metrics to evaluate the benchmark."""
+
+    aggregation: str = 'mean'
+    """ Aggregation function for the metrics. Default is 'mean'. Can be 'mean', 'pass@<k>' or a custom function name."""
+
+    shuffle: bool = False
+    """Whether to shuffle the dataset before evaluation."""
+
+    shuffle_choices: bool = False
+    """Whether to shuffle the choices in multiple-choice datasets."""
+
+    extra_params: Dict = field(default_factory=dict)
+    """ Additional parameters for the benchmark."""
+
+    def __post_init__(self):
+        """Validate fields after initialization."""
+        if self.few_shot_num < 0:
+            raise ValueError('few_shot_num must be >= 0')
+
+    def _update(self, args: dict):
+        """Update instance with provided arguments, maintaining backward compatibility."""
+        args = copy.deepcopy(args)
+
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+
+        if args.get('filters'):
+            if self.filters is None:
+                self.filters = OrderedDict()
+            new_filters = OrderedDict(args['filters'])
+            # insert filters at the beginning
+            self.filters = OrderedDict(list(new_filters.items()) + list(self.filters.items()))
+            del args['filters']
+        # Update fields with validation
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)  # Validate few_shot_num if it's being updated
+                if key == 'few_shot_num' and value < 0:
+                    raise ValueError('few_shot_num must be >= 0')
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary, maintaining backward compatibility."""
+        return asdict(self)
+
+    def to_string_dict(self) -> dict:
+        """Convert to string dictionary, excluding data_adapter."""
+        cur_dict = copy.deepcopy(asdict(self))
+        if 'data_adapter' in cur_dict:
+            del cur_dict['data_adapter']
+        return cur_dict
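
Illustrative usage sketch (not part of the diff): the BenchmarkMeta dataclass above is constructed with a benchmark name and dataset id and can then be overridden at runtime through _update(). The field values below are invented placeholders, and the import path simply mirrors the new evalscope/api/benchmark/meta.py module.

# Sketch only: field values are hypothetical.
from evalscope.api.benchmark.meta import BenchmarkMeta

meta = BenchmarkMeta(
    name='demo_mcq',                         # unique benchmark name (placeholder)
    dataset_id='modelscope/some-dataset',    # ModelScope id or local path (placeholder)
    few_shot_num=5,
    metric_list=['acc'],
)

# 'local_path' is remapped onto dataset_id for backward compatibility;
# remaining keys are applied via setattr, with few_shot_num re-validated (>= 0).
meta._update({'local_path': '/data/demo_mcq', 'few_shot_num': 0})
assert meta.to_string_dict()['dataset_id'] == '/data/demo_mcq'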
evalscope/api/dataset/dataset.py (new file)
@@ -0,0 +1,349 @@
+import abc
+import random
+from collections import defaultdict
+from dataclasses import dataclass, field
+from pydantic import BaseModel, Field
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
+
+from evalscope.api.messages import ChatMessage, messages_to_markdown
+from evalscope.api.tool import ToolInfo
+
+
+class Sample(BaseModel):
+    r"""Sample for an evaluation task."""
+
+    input: Union[str, List[ChatMessage]]
+    """The input to be submitted to the model."""
+
+    choices: Optional[List[str]] = None
+    """List of available answer choices (used only for multiple-choice evals)."""
+
+    target: Union[str, List[str]] = ''
+    """Ideal target output. May be a literal value or narrative text to be used by a model grader."""
+
+    id: Optional[int] = None
+    """Unique identifier for sample."""
+
+    group_id: Optional[int] = None
+    """Identifier for the group this sample belongs to, used for grouping k repeated samples."""
+
+    tools: Optional[List[ToolInfo]] = None
+    """List of tools available to the model during inference (optional)."""
+
+    subset_key: Optional[str] = None
+    """Key for the subset this sample belongs to, used for generating subsets (optional)."""
+
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    """Arbitrary metadata associated with the sample."""
+
+    sandbox: Optional[str] = None
+    """Sandbox environment type and optional config file."""
+
+    files: Optional[Dict[str, str]] = None
+    """Files that go along with the sample (copied to SandboxEnvironment)"""
+
+    setup: Optional[str] = None
+    """Setup script to run for sample (run within default SandboxEnvironment)."""
+
+    def pretty_print(self) -> str:
+        """Return a pretty-printed string representation of the sample."""
+        if isinstance(self.input, str):
+            input_text = self.input
+        else:
+            input_text = messages_to_markdown(self.input, max_length=50)
+        return f'Sample ID: {self.id}\nInput: {input_text}\nTarget: {self.target}'
+
+
+@dataclass
+class FieldSpec:
+    r"""Specification for mapping data source fields to sample fields."""
+
+    input: str = field(default='input')
+    """Name of the field containing the sample input."""
+
+    target: str = field(default='target')
+    """Name of the field containing the sample target."""
+
+    choices: str = field(default='choices')
+    """Name of field containing the list of answer choices."""
+
+    id: int = field(default=0)
+    """ Unique identifier for the sample."""
+
+    metadata: Optional[List[str]] = field(default=None)
+    """List of additional field names that should be read as metadata."""
+
+    sandbox: str = field(default='sandbox')
+    """Sandbox type along with optional config file."""
+
+    files: str = field(default='files')
+    """Files that go along with the sample."""
+
+    setup: str = field(default='setup')
+    """Setup script to run for sample (run within default SandboxEnvironment)."""
+
+
+class Dataset(Sequence[Sample], abc.ABC):
+    r"""A sequence of Sample objects.
+
+    Datasets provide sequential access (via conventional indexes or slicing)
+    to a collection of Sample objects.
+    """
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> Optional[str]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def location(self) -> Optional[str]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def shuffled(self) -> bool:
+        ...
+
+    @abc.abstractmethod
+    def __iter__(self) -> Iterator[Sample]:
+        """Return an iterator over the samples."""
+        ...
+
+    @abc.abstractmethod
+    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, 'Dataset']:
+        ...
+
+    @abc.abstractmethod
+    def __len__(self) -> int:
+        ...
+
+    @abc.abstractmethod
+    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'Dataset':
+        """Filter the dataset using a predicate. Only samples matching the predicate will be included.
+
+        Args:
+            predicate: Filtering function.
+            name: Name for filtered dataset (optional).
+
+        Returns:
+            Filtered dataset.
+        """
+        ...
+
+    @abc.abstractmethod
+    def shuffle(self, seed: Optional[int] = None) -> None:
+        """Shuffle the order of the dataset (in place).
+
+        Args:
+            seed: Random seed for shuffling (optional).
+        """
+        ...
+
+    @abc.abstractmethod
+    def shuffle_choices(self, seed: Optional[int] = None) -> None:
+        """Shuffle the order of the choices with each sample.
+
+        Args:
+            seed: Random seed for shuffling (optional).
+        """
+        ...
+
+    @abc.abstractmethod
+    def reindex(self, group_size=1):
+        """Reindex the dataset samples to ensure consistent ordering.
+
+        Args:
+            group_size: Number of samples per group for setting group_id.
+        """
+        ...
+
+
+class MemoryDataset(Dataset):
+    r"""A Dataset stored in memory."""
+
+    def __init__(
+        self,
+        samples: List[Sample],
+        name: Optional[str] = None,
+        location: Optional[str] = None,
+        shuffled: bool = False,
+    ) -> None:
+        r"""A dataset of samples held in an in-memory list.
+
+        Datasets provide sequential access (via conventional indexes or slicing)
+        to a collection of Sample objects. The ListDataset is explicitly
+        initialized with a list that is held in memory.
+
+        Args:
+            samples (List[Sample]): The list of sample objects.
+            name (str | None): Optional name for dataset.
+            location (str | None): Optional location for dataset.
+            shuffled (bool): Was the dataset shuffled after reading.
+        """
+        self.samples = samples
+        self._name = name
+        self._location = location
+        self._shuffled = shuffled
+
+    @property
+    def name(self) -> Optional[str]:
+        """Dataset name."""
+        return self._name
+
+    @property
+    def location(self) -> Optional[str]:
+        """Dataset location."""
+        return self._location
+
+    @property
+    def shuffled(self) -> bool:
+        """Was the dataset shuffled."""
+        return self._shuffled
+
+    def __iter__(self) -> Iterator[Sample]:
+        return iter(self.samples)
+
+    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, Dataset]:
+        if isinstance(index, int):
+            return self.samples[index]
+        else:
+            return MemoryDataset(
+                samples=self.samples[index],
+                name=self.name,
+                location=self.location,
+                shuffled=self.shuffled,
+            )
+
+    def __len__(self) -> int:
+        return len(self.samples)
+
+    def shuffle(self, seed: Optional[int] = None) -> None:
+        if seed is not None:
+            random.Random(seed).shuffle(self.samples)
+        else:
+            random.shuffle(self.samples)
+        self._shuffled = True
+
+    def shuffle_choices(self, seed: Optional[int] = None) -> None:
+        from evalscope.utils.multi_choices import answer_character
+
+        rand = random.Random(seed)
+        for sample in self.samples:
+            if not sample.choices:
+                continue
+            # The original positions
+            positions = list(range(len(sample.choices)))
+
+            # Shuffle the choices
+            rand.shuffle(positions)
+            shuffled_choices = [sample.choices[i] for i in positions]
+
+            # Map of original position / target letter
+            position_map = {i: answer_character(new_i) for new_i, i in enumerate(positions)}
+
+            # Update to the shuffled choices and target
+            sample.choices = shuffled_choices
+            sample.target = self._remap_target(sample.target, position_map=position_map)
+
+    def _remap_target(self, target: Union[str, List[str]], position_map: Dict[int, str]) -> Union[str, List[str]]:
+        from evalscope.utils.multi_choices import answer_index
+
+        if isinstance(target, list):
+            return [position_map[answer_index(t)] for t in target]
+        else:
+            return position_map[answer_index(target)]
+
+    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'MemoryDataset':
+        return MemoryDataset(
+            name=name or self.name,
+            location=self.location,
+            samples=[sample for sample in self.samples if predicate(sample)],
+            shuffled=self.shuffled,
+        )
+
+    def reindex(self, group_size=1):
+        # Reindex the dataset samples to ensure consistent ordering
+        for i, sample in enumerate(self.samples):
+            sample.id = i
+            sample.group_id = i // group_size
+
+
+class DatasetDict:
+    """
+    A dictionary-like container for datasets.
+    """
+
+    def __init__(self, datasets: Dict[str, Dataset]):
+        self.datasets = datasets
+
+    def __getitem__(self, key: str) -> Dataset:
+        return self.datasets[key]
+
+    def __setitem__(self, key: str, value: Dataset) -> None:
+        self.datasets[key] = value
+
+    def __delitem__(self, key: str) -> None:
+        del self.datasets[key]
+
+    def get(self, key: str, default: Optional[Dataset] = None) -> Optional[Dataset]:
+        return self.datasets.get(key, default)
+
+    def items(self):
+        return self.datasets.items()
+
+    def keys(self):
+        return self.datasets.keys()
+
+    def values(self):
+        return self.datasets.values()
+
+    def __len__(self) -> int:
+        return len(self.datasets)
+
+    @classmethod
+    def from_dataset(
+        cls,
+        dataset: Dataset,
+        subset_list: List[str],
+        limit: Optional[Union[int, float]] = None,
+        repeats: int = 1
+    ) -> 'DatasetDict':
+        """
+        Create a DatasetDict from a single Dataset using subset key in the sample.
+
+        Args:
+            dataset (Dataset): The dataset to wrap in a DatasetDict.
+            subset_list (List[str]): List of subset keys to include.
+            limit (int | float | None): Optional limit on number of samples per subset.
+                If int, limits to that many samples. If float, limits to that fraction of samples.
+
+        Returns:
+            DatasetDict: A new DatasetDict containing the provided dataset.
+        """
+        data_dict = defaultdict(list)
+        dataset_dict = defaultdict(list)
+        # init subset keys to prevent order issues
+        for key in subset_list:
+            data_dict[key] = []
+            dataset_dict[key] = []
+
+        # Loop through each sample in the dataset
+        for sample in dataset.samples:
+            subset_key = sample.subset_key or 'default'
+            data_dict[subset_key].append(sample)
+        # Create a MemoryDataset for each subset key
+        for key, samples in data_dict.items():
+            if key not in subset_list:
+                continue
+            # Apply limit if specified
+            if limit is not None:
+                if isinstance(limit, float):
+                    limit = int(len(samples) * limit)
+                total_limit = limit * repeats
+                samples = samples[:total_limit]
+            cur_dataset = MemoryDataset(samples, name=dataset.name)
+            # Reindex the dataset to ensure consistent IDs and group IDs
+            cur_dataset.reindex(group_size=repeats)
+            dataset_dict[key] = cur_dataset
+        return cls(dataset_dict)
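
Illustrative usage sketch (not part of the diff): the Sample, MemoryDataset, and DatasetDict classes above can be combined roughly as follows. The sample contents are invented placeholders, and the import path simply mirrors the new evalscope/api/dataset/dataset.py module.

# Sketch only: sample contents are hypothetical.
from evalscope.api.dataset.dataset import DatasetDict, MemoryDataset, Sample

samples = [
    Sample(input='1 + 1 = ?', choices=['1', '2', '3'], target='B', subset_key='math'),
    Sample(input='Capital of France?', target='Paris', subset_key='geo'),
]
dataset = MemoryDataset(samples, name='toy')

# Group by Sample.subset_key; keep at most 10 samples per subset, no repeats.
subsets = DatasetDict.from_dataset(dataset, subset_list=['math', 'geo'], limit=10, repeats=1)
print(len(subsets['math']))              # -> 1
print(subsets['geo'][0].pretty_print())  # ids are reassigned by reindex(), so id is 0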