evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union
 
-from
+from . import instructions_registry
 
 
 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)
 
     return {
-        '
-        '
-        '
-        '
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }
 
 
 def agg_inst_level_acc(items):
-
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
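The corrected `agg_inst_level_acc` averages over the flat list it is actually given and guards against an empty list. A minimal stand-alone sketch of just that function, taken from the hunk above (the surrounding evalscope plumbing is omitted):

```python
def agg_inst_level_acc(items):
    # `items` is a flat list of booleans, one entry per individual instruction;
    # an empty list yields 0 instead of raising ZeroDivisionError.
    return sum(items) / len(items) if items else 0


# Example: two of three instructions were followed.
print(agg_inst_level_acc([True, False, True]))  # 0.666...
print(agg_inst_level_acc([]))                   # 0
```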
@@ -1,70 +1,35 @@
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='iquiz',
+        pretty_name='IQuiz',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+        description=
+        'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
+        dataset_id='AI-ModelScope/IQuiz',
+        metric_list=['acc'],
+        subset_list=['IQ', 'EQ'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+    )
 )
-class IQuizAdapter(
+class IQuizAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-
-
-
-
-
-
-        "question":"天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
-        "choices":["星期一","星期二","星期三","星期四"],
-        "answer":"D",
-        "level":1
-        }
-        """
-        prompt = f"问题: {input_d['question']}\n"
-        prompt += self.__form_options(input_d['choices'])
-        return self.gen_prompt_data(prompt)
-
-    def __form_options(self, options: list):
-        option_str = '选项:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return exact_match(gold=gold, pred=pred)
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={'level': record.get('level', 'unknown')},
+        )
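For illustration, the record-to-sample mapping performed by the new adapter can be reproduced with stand-in types. The example record is the one visible in the docstring removed above; the `Sample` dataclass here is only a placeholder for `evalscope.api.dataset.Sample`, not the real class:

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class Sample:  # stand-in for evalscope.api.dataset.Sample
    input: str
    choices: List[str]
    target: str
    metadata: Dict[str, Any] = field(default_factory=dict)


def record_to_sample(record: Dict[str, Any]) -> Sample:
    # Same field mapping as IQuizAdapter.record_to_sample in the hunk above.
    return Sample(
        input=record['question'],
        choices=record['choices'],
        target=record['answer'],
        metadata={'level': record.get('level', 'unknown')},
    )


record = {
    'question': '天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?',
    'choices': ['星期一', '星期二', '星期三', '星期四'],
    'answer': 'D',
    'level': 1,
}
print(record_to_sample(record).target)  # 'D'
```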
@@ -130,8 +130,8 @@ def evaluate_generations(
         results[index] = result
         metadata[index] = meta
 
-    assert len(results
-
+    assert len(results
+    ) == len(generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'
 
     return results, metadata
 
@@ -1,88 +1,138 @@
-from
-
-from evalscope.
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import convert_numpy_types
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    prompt_template=
-    '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='live_code_bench',
+        pretty_name='Live-Code-Bench',
+        tags=[Tags.CODING],
+        description=
+        'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
+        dataset_id='AI-ModelScope/code_generation_lite',
+        subset_list=['release_latest'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',
+        extra_params={
+            'start_date': None,
+            'end_date': None,
+            'timeout': 6,
+            'debug': False
+        },
+    )
 )
-class LiveCodeBenchAdapter(
+class LiveCodeBenchAdapter(DefaultDataAdapter):
+    """
+    Live Code Bench adapter using the new data processing framework.
+    """
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-
-        self.
-        self.
-
-        self.
-
-    def
-
-
-
-
-
-
-
-            filtered_datasets = filter_date(datasets, start_date=self.start_date, end_date=self.end_date)
-
-            transformed_datasets = [transform(item) for item in tqdm(filtered_datasets, desc='Transforming data')]
-            new_dataset_dict[subset_key] = {self.eval_split: transformed_datasets}
-        return new_dataset_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-        """
-        format_prompt = input_d['format_prompt']
-        question_content = input_d['question_content']
+        self.timeout = self.extra_params.get('timeout', 6)
+        self.debug = self.extra_params.get('debug', False)
+        self.start_date = self.extra_params.get('start_date')
+        self.end_date = self.extra_params.get('end_date')
+
+        self.save_metadata = False  # Don't save metadata, since they are large
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        from .load_utils import transform
+
+        record = transform(record)
+
+        question_content = record['question_content']
+        format_prompt = record['format_prompt']
         full_prompt = self.prompt_template.format(question_content=question_content, format_prompt=format_prompt)
 
-        return
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target='',
+            metadata={
+                'evaluation_sample': record['evaluation_sample'],
+                'contest_date': record['contest_date']
+            }
+        )
 
-    def
-
-        return input_d
+    def sample_filter(self, sample):
+        from .load_utils import filter_date
 
-
-
+        return filter_date(sample.metadata['contest_date'], start_date=self.start_date, end_date=self.end_date)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
         from .extract_utils import extract_code_generation
+        return extract_code_generation(prediction)
 
-
-
-
-
-
-
-
-
-            num_process_evaluate=1,
-            timeout=self.timeout,
-            debug=self.debug,
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        from .evaluate_utils import codegen_metrics
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
         )
-
+
+        references = [{'input_output': task_state.metadata['evaluation_sample']}]
+        predictions = [[filtered_prediction]]
+
+        try:
+            metrics, eval_results, final_metadata = codegen_metrics(
+                references,
+                predictions,
+                k_list=[1],
+                num_process_evaluate=1,
+                timeout=self.timeout,
+                debug=self.debug,
+            )
+            pass_rate = metrics['pass@1'] / 100  # convert to point scale
+
+            score.value = {'pass': float(pass_rate > 0)}
+            score.explanation = f"Pass@1: {metrics['pass@1']}%"
+
+            # Convert numpy types to native Python types for JSON serialization
+            serializable_eval_results = convert_numpy_types(eval_results)
+            serializable_final_metadata = convert_numpy_types(final_metadata)
+
+            score.metadata = {
+                'pass_rate': float(pass_rate),
+                'timeout': self.timeout,
+                'debug': self.debug,
+                'eval_results': serializable_eval_results,
+                'final_metadata': serializable_final_metadata
+            }
+        except Exception as e:
+            score.value = {'pass': False}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+
+        score.main_score_name = 'pass'
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # calculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
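The adapter records a per-sample `pass` value and defers Pass@k aggregation to `PassAtK`. As a reference point only, the standard unbiased pass@k estimator from the HumanEval paper looks like the sketch below; whether evalscope's `PassAtK` uses exactly this form is an assumption here, not something the diff states:

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n generated samples, c of them passing, budget k."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


print(pass_at_k(n=5, c=1, k=1))  # 0.2
print(pass_at_k(n=5, c=1, k=5))  # 1.0
```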
@@ -32,8 +32,8 @@ def transform(item):
         private_test_cases = json.loads(item['private_test_cases'])
     except Exception as e:  # noqa: F841
         private_test_cases = json.loads(
-            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))
-
+            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))))
+        )
 
     # load metadata
     metadata = json.loads(item['metadata'])
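The repaired call above closes the decode chain for compressed private test cases, which the fallback branch reads as base64 -> zlib -> pickle -> JSON. A self-contained round trip with toy data (not evalscope's actual payload):

```python
import base64
import json
import pickle
import zlib

# Encode a JSON string the same way the fallback branch expects it to be stored.
original = json.dumps([{'input': '1 2\n', 'output': '3\n'}])
blob = base64.b64encode(zlib.compress(pickle.dumps(original))).decode('utf-8')

# Decode chain as in the fixed line: b64decode -> decompress -> unpickle -> json.loads.
decoded = json.loads(pickle.loads(zlib.decompress(base64.b64decode(blob.encode('utf-8')))))
print(decoded[0]['output'])  # '3\n'
```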
@@ -47,25 +47,17 @@ def transform(item):
     return item
 
 
-def filter_date(
-    new_dataset = []
-
-    for item in dataset:
-        contest_date = datetime.fromisoformat(item['contest_date'])
-        if start_date is not None:
-            p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
-            if p_start_date > contest_date:
-                continue
+def filter_date(contest_date, start_date=None, end_date=None) -> bool:
 
-
-
-
-
+    contest_date = datetime.fromisoformat(contest_date)
+    if start_date is not None:
+        p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
+        if p_start_date > contest_date:
+            return False
 
-
+    if end_date is not None:
+        p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
+        if p_end_date < contest_date:
+            return False
 
-
-    logger.info(
-        f'Filtered dataset with start_date: {start_date}, end_date: {end_date}, remaining items: {len(new_dataset)}'
-    )
-    return new_dataset
+    return True
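The rewritten `filter_date` is now a per-sample predicate (used by `sample_filter` in the adapter above) rather than a whole-dataset filter. A runnable stand-alone copy of the logic shown in the hunk:

```python
from datetime import datetime


def filter_date(contest_date, start_date=None, end_date=None) -> bool:
    # Keep a sample only if its contest date falls inside the optional window.
    contest_date = datetime.fromisoformat(contest_date)
    if start_date is not None and datetime.strptime(start_date, '%Y-%m-%d') > contest_date:
        return False
    if end_date is not None and datetime.strptime(end_date, '%Y-%m-%d') < contest_date:
        return False
    return True


print(filter_date('2024-08-15T00:00:00', start_date='2024-07-01', end_date='2024-09-01'))  # True
print(filter_date('2024-06-15T00:00:00', start_date='2024-07-01'))                         # False
```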
@@ -4,18 +4,22 @@ import faulthandler
 import json
 import numpy as np
 import platform
+
 # to run the solution files we're using a timing based approach
 import signal
 import sys
 import time
+
 # used for debugging to time steps
 from datetime import datetime
 from decimal import Decimal
 from enum import Enum
 from functools import partial
 from io import StringIO
+
 # from pyext import RuntimeModule
 from types import ModuleType
+
 # used for testing the code that reads from input
 from unittest.mock import mock_open, patch
 
@@ -342,8 +346,8 @@ def grade_stdio(
                 return all_results, WA_send_args
 
         for output_line_idx, (
-
-
+            stripped_prediction_line,
+            stripped_gt_out_line,
         ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
             WA_send_args['error_message'] = (
                 f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
@@ -1,82 +1,56 @@
 from typing import Any
 
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-@
-
-
-
-
-
-
-
-
-
-
-
-
-    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+MARITIME_PROMPT_TEMPLATE = '请回答单选题。要求只输出选项,不输出解释,将选项放在[]里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:[A]\n 当前题目\n {question}\n选项:\n{choices}'  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='maritime_bench',
+        pretty_name='MaritimeBench',
+        tags=[Tags.CHINESE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
+        dataset_id='HiDolphin/MaritimeBench',
+        metric_list=['acc'],
+        few_shot_num=0,
+        eval_split='test',
+        prompt_template=MARITIME_PROMPT_TEMPLATE,
+    )
 )
-class MaritimeBenchAdapter(
+class MaritimeBenchAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
        super().__init__(**kwargs)
 
-        self.
-
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the raw model prediction (pred).
-
-        Args:
-            pred: model prediction. Depending on the model.
-
-        Returns:
-            The parsed prediction. e.g. model answer... Depending on the model.
-        """
-
-        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
-
-    def match(self, gold: Any, pred: Any) -> Any:
-        """
-        Match the gold answer with the predicted answer.
-
-        Args:
-            gold: The gold answer.
-            pred: The predicted answer.
-
-        Returns:
-            The result of the match.
-        """
-        return exact_match(gold=gold, pred=pred)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record) -> Sample:
+        # Extract available choices from the record
+        choices = []
+        choice_letters = ['A', 'B', 'C', 'D']
+        for letter in choice_letters:
+            if letter in record and record[letter]:
+                choices.append(record[letter])
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+        )
+
+    def format_prompt_template(self, sample):
+        choices = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(sample.choices)])
+        return MARITIME_PROMPT_TEMPLATE.format(question=sample.input, choices=choices)
+
+    def extract_answer(self, prediction, task_state):
+        # use regex to extract the answer from the prediction
+        import re
+        match = re.search(r'\[([A-D])\]', prediction)
+        if match:
+            return match.group(1)
+        return ''