evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/multi_choice_adapter.py
@@ -0,0 +1,83 @@
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.utils.multi_choices import (
+    FEW_SHOT_TEMPLATE,
+    MultipleChoiceTemplate,
+    format_example,
+    parse_answers,
+    parse_answers_zh,
+    prompt,
+    valid_template,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+
+class MultiChoiceAdapter(DefaultDataAdapter):
+    """
+    Adapter for multi-choice benchmarks.
+    This adapter formats the input for multi-choice questions and handles few-shot examples.
+    """
+
+    multiple_correct: bool = False
+    """Whether the benchmark allows multiple correct answers."""
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        """
+        Format the basic prompt template with the sample data.
+
+        Args:
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The formatted prompt ready for model input
+        """
+        assert valid_template(self.prompt_template), 'Prompt template is not valid'
+
+        return prompt(
+            question=sample.input,
+            choices=Choices(sample.choices),
+            template=self.prompt_template,
+        )
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        """
+        Format the few-shot template with demonstrations and the main prompt.
+
+        Args:
+            fewshot (str): The formatted few-shot demonstration examples
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The complete formatted input with few-shot context
+        """
+
+        few_shot_prompt_template = self.few_shot_prompt_template or (FEW_SHOT_TEMPLATE + self.prompt_template)
+
+        assert valid_template(few_shot_prompt_template), 'Few-shot prompt template is not valid'
+
+        return prompt(
+            question=sample.input, choices=Choices(sample.choices), template=few_shot_prompt_template, fewshot=fewshot
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Convert a sample to a few-shot formatted string.
+
+        Args:
+            sample (Sample): The sample object to format
+
+        Returns:
+            str: The formatted few-shot example string
+        """
+        return format_example(question=sample.input, choices=Choices(sample.choices), answer=Target(sample.target))
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        if self.prompt_template in [
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE
+        ]:
+            # For Chinese COT template, we use a different extraction method
+            answers = parse_answers_zh(task_state, multiple_correct=self.multiple_correct)
+        else:
+            answers = parse_answers(task_state, multiple_correct=self.multiple_correct)
+        return ''.join(sorted(list(answers)))
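
For orientation, here is a minimal sketch of how a concrete multiple-choice benchmark could build on `MultiChoiceAdapter`. It is not part of the diff: the class name, the field mapping, and the exact `Sample` constructor arguments are assumptions inferred from how the adapters in this release use `sample.input`, `sample.choices`, and `sample.target`; the `record_to_sample` hook mirrors the one overridden by `Text2ImageAdapter` below.

```python
# Hypothetical subclass, for illustration only -- not shipped in evalscope 1.0.0.
from evalscope.api.benchmark.adapters.multi_choice_adapter import MultiChoiceAdapter
from evalscope.api.dataset import Sample


class MyMCQAdapter(MultiChoiceAdapter):
    """Sketch of an adapter for a simple A/B/C/D question set."""

    multiple_correct = False  # one gold letter per question

    def record_to_sample(self, record: dict) -> Sample:
        # Map a raw record onto the fields the mixin methods rely on:
        # `input` (question), `choices` (option texts), `target` (gold letter).
        return Sample(
            input=record['question'],
            choices=[record['A'], record['B'], record['C'], record['D']],
            target=record['answer'],  # e.g. 'B'
            metadata={'subject': record.get('subject', '')},
        )
```

With such a mapping in place, `format_prompt_template`, `sample_to_fewshot`, and `extract_answer` above take care of prompting and answer parsing.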
--- /dev/null
+++ b/evalscope/api/benchmark/adapters/text2image_adapter.py
@@ -0,0 +1,155 @@
+import base64
+import os
+
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import ContentImage
+from evalscope.api.metric import Score
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
+from evalscope.api.registry import get_metric
+from evalscope.constants import EvalType
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class Text2ImageAdapter(DefaultDataAdapter):
+    """Text to Image Adapter for benchmarks."""
+
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': record.get('category', ''),
+                'tags': record.get('tags', []),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return ModelOutput(
+                model=model.name,
+                choices=[ChatCompletionChoice.from_content('')],
+            )
+        else:
+            # Execute model inference with the processed input and any tools
+            model_output = model.generate(input=sample.input, tools=sample.tools)
+            return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes. Save generated images to output_dir.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+        else:
+            image_id = f"{sample.metadata.get('id',sample.id)}_{sample.group_id}"
+            output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
+            if not os.path.exists(os.path.dirname(output_path)):
+                os.makedirs(os.path.dirname(output_path))
+            # get base64 image from model_output
+            content = model_output.message.content[0]
+
+            assert isinstance(content, ContentImage), 'Expected ContentImage in model output'
+
+            image_base64 = content.image
+            with open(output_path, 'wb') as f:
+                f.write(base64.b64decode(image_base64))
+
+            sample.metadata['image_path'] = output_path
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+
+    # NOTE: thread safe is needed, since we can't batch inference here.
+    @thread_safe
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get('image_path', original_prediction)
+        prompt = task_state.input[0].content
+        meta = task_state.metadata
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(image_path, prompt)[0]
+
+                # fine-granular metrics
+                category = meta.get('category')
+                if category:
+                    metric_name = f'{metric_name}_{category}'
+                if isinstance(metric_score, dict):
+                    for k, v in metric_score.items():
+                        score.value[f'{metric_name}_{k}'] = v.cpu().item()
+                else:
+                    score.value[metric_name] = metric_score.cpu().item()
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add aggregation name for needle haystack adapter
+        return super()._on_generate_report(scores, model_name, False)
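
`match_score` above accepts each entry of `metric_list` either as a bare metric name or as a one-key dict mapping the name to constructor kwargs, resolving both through `get_metric` from the registry. Below is a small sketch of the two shapes; the metric name and kwargs are placeholders, not a claim about what evalscope actually registers.

```python
# Two accepted shapes for metric_list items (e.g. in a BenchmarkMeta), mirroring
# the isinstance(str) / isinstance(dict) branches in match_score above.
# Names and kwargs are illustrative placeholders.
metric_list = [
    'example_score',                              # bare name -> get_metric('example_score')()
    {'example_score': {'model_path': 'local'}},   # name -> kwargs passed to the metric class
]
# If the sample's metadata carries category='people', the resulting score keys
# are suffixed, e.g. 'example_score_people'.
```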
--- /dev/null
+++ b/evalscope/api/benchmark/benchmark.py
@@ -0,0 +1,321 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from evalscope.api.dataset import DatasetDict, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
+from evalscope.api.metric import AggScore, SampleScore
+from evalscope.api.mixin import LLMJudgeMixin
+from evalscope.api.model import Model
+from evalscope.report import Report
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class DataAdapter(LLMJudgeMixin, ABC):
+    """
+    Data Adapter for the benchmark.
+    """
+
+    def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskConfig'] = None):
+        self._benchmark_meta = benchmark_meta
+        self._task_config = task_config
+        super().__init__(task_config=task_config)
+
+        self.reformat_subset = False
+        """Whether to reformat the subset data with subset key"""
+
+        self.split_as_subset = False
+        """Whether to use the split name as the dataset subsets"""
+
+        self.shuffle_choices = False
+        """Whether to shuffle the choices in the dataset"""
+
+        self.save_metadata = True
+        """Whether to save metadata in the review result"""
+
+        self.category_map = {}
+        """Category map for the benchmark"""
+
+        self.current_subset_name = ''
+        """Subset name when loading datasets"""
+
+        # dataset
+        self.test_dataset: Optional[DatasetDict] = None
+        """Dataset to be evaluated"""
+
+        self.fewshot_dataset: Optional[DatasetDict] = None
+        """Dataset for few-shot evaluation"""
+
+        # filters
+        self._filter_ensemble: Optional[OrderedDict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the benchmark metadata to a dictionary."""
+        return self._benchmark_meta.to_string_dict()
+
+    @abstractmethod
+    def load_dataset(self) -> DatasetDict:
+        pass
+
+    @abstractmethod
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        pass
+
+    @abstractmethod
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        pass
+
+    @abstractmethod
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        pass
+
+    @abstractmethod
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a report based on the evaluation results.
+        """
+        pass
+
+    @property
+    def name(self) -> str:
+        """
+        Return the unique name of the benchmark.
+        """
+        return self._benchmark_meta.name
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Return the dataset ID or path to the benchmark.
+        """
+        return self._benchmark_meta.dataset_id
+
+    @property
+    def output_types(self) -> Optional[List[str]]:
+        """
+        Return the output types of the benchmark.
+        """
+        return self._benchmark_meta.output_types
+
+    @property
+    def limit(self) -> Optional[Union[int, float]]:
+        """
+        Return the limit for the benchmark.
+        """
+        return self._task_config.limit
+
+    @property
+    def repeats(self) -> int:
+        """
+        Return the number of repeats for each sample in the benchmark.
+        """
+        return self._task_config.repeats
+
+    @property
+    def dataset_hub(self) -> str:
+        """
+        Return the dataset hub type for the benchmark.
+        """
+        return self._task_config.dataset_hub
+
+    @dataset_hub.setter
+    def dataset_hub(self, value: str):
+        """
+        Set the dataset hub type for the benchmark.
+        """
+        self._task_config.dataset_hub = value
+
+    @property
+    def eval_type(self) -> str:
+        """
+        Return the evaluation type for the benchmark.
+        """
+        return self._task_config.eval_type
+
+    @property
+    def subset_list(self) -> List[str]:
+        """
+        Return the subset list of the benchmark.
+        """
+        return self._benchmark_meta.subset_list
+
+    @subset_list.setter
+    def subset_list(self, value: List[str]):
+        """
+        Set the subset list of the benchmark.
+        """
+        self._benchmark_meta.subset_list = value
+
+    @property
+    def metric_list(self) -> List[Union[str, Dict[str, Any]]]:
+        """
+        Return the metric list of the benchmark.
+        """
+        return self._benchmark_meta.metric_list
+
+    @property
+    def default_subset(self) -> str:
+        """
+        Return the default subset of the benchmark.
+        """
+        return self._benchmark_meta.default_subset
+
+    @property
+    def few_shot_num(self) -> int:
+        """
+        Return the few shot number of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_num
+
+    @few_shot_num.setter
+    def few_shot_num(self, value: int):
+        """
+        Set the few shot number of the benchmark.
+        """
+        self._benchmark_meta.few_shot_num = value
+
+    @property
+    def few_shot_random(self) -> bool:
+        """
+        Return whether few shot is random for the benchmark.
+        """
+        return self._benchmark_meta.few_shot_random
+
+    @property
+    def train_split(self) -> Optional[str]:
+        """
+        Return the train split of the benchmark.
+        """
+        return self._benchmark_meta.train_split
+
+    @property
+    def eval_split(self) -> Optional[str]:
+        """
+        Return the eval split of the benchmark.
+        """
+        return self._benchmark_meta.eval_split
+
+    @property
+    def prompt_template(self) -> Optional[str]:
+        """
+        Return the prompt template of the benchmark.
+        """
+        return self._benchmark_meta.prompt_template
+
+    @prompt_template.setter
+    def prompt_template(self, value: str):
+        """
+        Set the prompt template of the benchmark.
+        """
+        self._benchmark_meta.prompt_template = value
+
+    @property
+    def system_prompt(self) -> Optional[str]:
+        """
+        Return the system prompt of the benchmark.
+        """
+        return self._benchmark_meta.system_prompt
+
+    @property
+    def query_template(self) -> Optional[str]:
+        """
+        Return the query template of the benchmark.
+        """
+        return self._benchmark_meta.query_template
+
+    @property
+    def few_shot_prompt_template(self) -> Optional[str]:
+        """
+        Return the few-shot prompt template of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_prompt_template
+
+    @property
+    def pretty_name(self) -> Optional[str]:
+        """
+        Return the pretty name of the benchmark.
+        """
+        return self._benchmark_meta.pretty_name
+
+    @property
+    def description(self) -> Optional[str]:
+        """
+        Return the description of the benchmark.
+        """
+        return self._benchmark_meta.description
+
+    @property
+    def tags(self) -> Optional[List[str]]:
+        """
+        Return the tags of the benchmark.
+        """
+        return self._benchmark_meta.tags
+
+    @property
+    def filters(self) -> Optional[OrderedDict]:
+        """
+        Return the filters of the benchmark.
+        """
+        return self._benchmark_meta.filters
+
+    @property
+    def filter_ensemble(self) -> Optional[FilterEnsemble]:
+        """
+        Return the filter ensemble of the benchmark.
+        """
+        if self._filter_ensemble is None:
+            if self.filters:
+                self._filter_ensemble = build_filter_ensemble(filters=self.filters)
+        return self._filter_ensemble
+
+    @property
+    def aggregation(self) -> str:
+        """
+        Return the aggregation function for the metrics.
+        """
+        return self._benchmark_meta.aggregation
+
+    @property
+    def extra_params(self) -> Optional[Dict]:
+        """
+        Return the extra parameters of the benchmark.
+        """
+        return self._benchmark_meta.extra_params
+
+    @property
+    def seed(self) -> Optional[int]:
+        """
+        Return the seed for the benchmark.
+        """
+        return self._task_config.seed
+
+    @contextlib.contextmanager
+    def _temporary_attribute(self, attr_name: str, new_value):
+        """
+        Set a temporary value for an attribute and restore the original value after the context block.
+
+        Args:
+            attr_name: The name of the attribute to temporarily set.
+            new_value: The new value to set for the attribute.
+        """
+        had_attr = hasattr(self, attr_name)
+        original_value = getattr(self, attr_name, None) if had_attr else None
+
+        setattr(self, attr_name, new_value)
+        try:
+            yield
+        finally:
+            if had_attr:
+                setattr(self, attr_name, original_value)
+            else:
+                delattr(self, attr_name)
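
The five abstract methods above define the per-benchmark lifecycle. The loop below is a simplified sketch of how an evaluator might drive a `DataAdapter`; it is not evalscope's actual evaluator (see `evalscope/evaluator/evaluator.py`), and the dict-like iteration over `DatasetDict` is an assumption.

```python
# Simplified driver sketch for a DataAdapter implementation (illustrative only).
def run_benchmark(adapter, model, output_dir: str):
    dataset = adapter.load_dataset()                      # DatasetDict of subsets
    sample_scores = []
    for subset_name, subset in dataset.items():           # assumes dict-like access
        for sample in subset:
            state = adapter.run_inference(model, sample, output_dir)    # -> TaskState
            sample_scores.append(adapter.calculate_metrics(state))      # -> SampleScore
    agg_scores = adapter.aggregate_scores(sample_scores)                # -> List[AggScore]
    return adapter.generate_report({adapter.default_subset: agg_scores}, model.name, output_dir)
```

The `_temporary_attribute` context manager at the end lets callers briefly swap an adapter attribute (for example a subset-specific prompt template) and restore it afterwards, even if the attribute did not exist before.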
--- /dev/null
+++ b/evalscope/api/benchmark/meta.py
@@ -0,0 +1,115 @@
+import copy
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
+
+from evalscope.constants import OutputType
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+
+
+@dataclass
+class BenchmarkMeta:
+    """Metadata for a benchmark, including dataset and model configurations."""
+
+    name: str
+    """ Unique name of the benchmark."""
+
+    dataset_id: str
+    """ Dataset id on modelscope or path to local dataset."""
+
+    data_adapter: Optional[Type['DataAdapter']] = None
+    """ Data adapter class for the benchmark."""
+
+    output_types: List[str] = field(default_factory=lambda: [OutputType.GENERATION])
+    """ List of output types supported by the benchmark."""
+
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
+    """ List of subsets available for the benchmark."""
+
+    default_subset: str = 'default'
+    """ Default subset to use for the benchmark."""
+
+    few_shot_num: int = 0
+    """ Number of few-shot examples to use."""
+
+    few_shot_random: bool = False
+    """ Whether to use random few-shot examples."""
+
+    train_split: Optional[str] = None
+    """ Training split to use for the benchmark."""
+
+    eval_split: Optional[str] = None
+    """ Evaluation split to use for the benchmark."""
+
+    prompt_template: Optional[str] = None
+    """ Prompt template to use for the benchmark."""
+
+    few_shot_prompt_template: Optional[str] = None
+    """ Few-shot prompt template to use for the benchmark."""
+
+    system_prompt: Optional[str] = None
+    """ System prompt to use for the benchmark."""
+
+    query_template: Optional[str] = None
+    """ Query template to use for the benchmark."""
+
+    pretty_name: Optional[str] = None
+    """ Human-readable name for the benchmark."""
+
+    description: Optional[str] = None
+    """ Description of the benchmark."""
+
+    tags: List[str] = field(default_factory=list)
+    """ Tags associated with the benchmark."""
+
+    filters: Optional[OrderedDict] = None
+    """ Filters to apply to the dataset on model output."""
+
+    metric_list: List[Union[str, Dict[str, Any]]] = field(default_factory=list)
+    """ List of metrics to evaluate the benchmark."""
+
+    aggregation: str = 'mean'
+    """ Aggregation function for the metrics. Default is 'mean'. Can be 'mean', 'pass@<k>' or a custom function name."""
+
+    extra_params: Dict = field(default_factory=dict)
+    """ Additional parameters for the benchmark."""
+
+    def __post_init__(self):
+        """Validate fields after initialization."""
+        if self.few_shot_num < 0:
+            raise ValueError('few_shot_num must be >= 0')
+
+    def _update(self, args: dict):
+        """Update instance with provided arguments, maintaining backward compatibility."""
+        args = copy.deepcopy(args)
+
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+
+        if args.get('filters'):
+            if self.filters is None:
+                self.filters = OrderedDict()
+            new_filters = OrderedDict(args['filters'])
+            # insert filters at the beginning
+            self.filters = OrderedDict(list(new_filters.items()) + list(self.filters.items()))
+            del args['filters']
+        # Update fields with validation
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)  # Validate few_shot_num if it's being updated
+                if key == 'few_shot_num' and value < 0:
+                    raise ValueError('few_shot_num must be >= 0')
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary, maintaining backward compatibility."""
+        return asdict(self)
+
+    def to_string_dict(self) -> dict:
+        """Convert to string dictionary, excluding data_adapter."""
+        cur_dict = copy.deepcopy(asdict(self))
+        if 'data_adapter' in cur_dict:
+            del cur_dict['data_adapter']
+        return cur_dict
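
Finally, a sketch of describing a benchmark with the new dataclass. The field names match `BenchmarkMeta` above; the dataset path and metric name are placeholders, and the registration step is omitted because the helper names in `evalscope/api/registry.py` are not shown in this diff.

```python
# Illustrative BenchmarkMeta declaration (values are placeholders).
from evalscope.api.benchmark import BenchmarkMeta
from evalscope.api.benchmark.adapters.multi_choice_adapter import MultiChoiceAdapter

my_mcq_meta = BenchmarkMeta(
    name='my_mcq',                       # unique benchmark name
    dataset_id='path/to/local/dataset',  # ModelScope id or local path
    data_adapter=MultiChoiceAdapter,     # a real benchmark would pass its own subclass
    subset_list=['default'],
    few_shot_num=5,
    train_split='train',
    eval_split='test',
    metric_list=['example_score'],       # placeholder; resolved via the metric registry
)

print(my_mcq_meta.to_string_dict())      # config view without the data_adapter class
```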