evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/hle/hle_adapter.py

@@ -1,9 +1,13 @@
 import re
-from …
-… (old lines 3-4: content not shown in this view)
-from evalscope.…
-from evalscope.…
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa

@@ -21,98 +25,128 @@ SUBSET_LIST = [
     'Other',
 ]
 
+ANSWER_TYPE_EXACT_MATCH = 'exactMatch'
+ANSWER_TYPE_MULTIPLE_CHOICE = 'multipleChoice'
+
+# System prompt constants
+SYSTEM_EXACT_ANSWER = 'Your response should be in the following format:\nExplanation: {your explanation for your final answer}\nExact Answer: {your succinct, final answer}\nConfidence: {your confidence score between 0% and 100% for your answer}'
+
+SYSTEM_MC = 'Your response should be in the following format:\nExplanation: {your explanation for your answer choice}\nAnswer: {your chosen answer}\nConfidence: {your confidence score between 0% and 100% for your answer}'
+
+JUDGE_PROMPT = """Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {response}
+
+[correct_answer]: {correct_answer}
 
-… (old lines 25-37: content not shown in this view)
+Your judgment must focus only on if there are meaningful differences between [correct_answer] and the [response]. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match. Explain why the [response] is correct or incorrect based on [correct_answer] in one or two sentences. Finally, write your answer in the format 'GRADE: C' for correct answer or 'GRADE: I' for incorrect answer.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='hle',
+        pretty_name="Humanity's-Last-Exam",
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description='Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 '
+        'questions across a broad range of subjects. It was created jointly by the Center '
+        'for AI Safety and Scale AI. The benchmark classifies the questions into the '
+        'following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), '
+        'humanities/social science (9%), computer science/artificial intelligence (10%), '
+        'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
+        'require the ability to understand both text and images, i.e., multi-modality. '
+        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
+        'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.',  # noqa: E501
+        dataset_id='cais/hle',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='{question}',
+        extra_params={'include_multi_modal': True}
+    )
 )
-class HLEAdapter(…
+class HLEAdapter(DefaultDataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self.…
-… (old lines 45-111: content not shown in this view)
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.reformat_subset = True
+        self.include_multi_modal = self.extra_params.get('include_multi_modal', True)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answer_type = record['answer_type']
+        system_prompt = (SYSTEM_EXACT_ANSWER if answer_type == ANSWER_TYPE_EXACT_MATCH else SYSTEM_MC)
+        text_content = ContentText(text=record['question'])
+
+        content: List[Content] = [text_content]
+        if record['image']:
+            image_content = ContentImage(image=record['image'])
+            content.append(image_content)
+
+        messages: List[ChatMessage] = [
+            ChatMessageSystem(content=system_prompt),
+            ChatMessageUser(content=content),
+        ]
+        return Sample(
+            input=messages,
+            subset_key=record['category'],
+            metadata={
+                'uid': record['id'],
+                'author_name': record['author_name'],
+                'rationale': record['rationale'],
+                'raw_subject': record['raw_subject'],
+                'category': record['category'],
+                'has_image': bool(record['image']),
+            },
+            target=record['answer'],
+        )
+
+    def sample_filter(self, sample):
+        if not self.include_multi_modal:
+            if sample.metadata is not None and sample.metadata['has_image']:
+                return False
+        return True
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        confidence = 100
+        if task_state.output and task_state.output.completion:
+            confidence_match = re.search(r'confidence:\s*(\d+)', task_state.output.completion, re.IGNORECASE)
+            if confidence_match:
+                confidence = int(confidence_match.group(1))
+
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.input_text, response=filtered_prediction, correct_answer=reference
+        )
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        accuracy_score = re.search(r'GRADE:\s*([CI])', judge_response, re.IGNORECASE)
+        if accuracy_score:
+            score.value = {
+                'acc': 1.0 if accuracy_score.group(1) == 'C' else 0.0,
+            }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+            'confidence': confidence,
         }
-
-
-        # zip dict answers
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-        return super().compute_metric(res_dict, **kwargs)
+        score.main_score_name = 'acc'
+        return score
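The new llm_match_score above grades each HLE answer with an LLM judge and then pulls two values out of free-form text: a self-reported confidence from the model completion and a 'GRADE: C/I' verdict from the judge reply. The snippet below is a minimal, self-contained illustration of just that parsing step, using the same regular expressions as the hunk; the completion and judge_response strings are made-up examples, not data from the benchmark.

import re

# Hypothetical strings shaped like the formats requested by SYSTEM_EXACT_ANSWER
# and returned by the judge model; they are examples only.
completion = (
    'Explanation: The integral evaluates to pi/4.\n'
    'Exact Answer: pi/4\n'
    'Confidence: 85%'
)
judge_response = 'The response matches the correct answer. GRADE: C'

# Confidence parsing, as in llm_match_score: default to 100 when absent.
confidence = 100
confidence_match = re.search(r'confidence:\s*(\d+)', completion, re.IGNORECASE)
if confidence_match:
    confidence = int(confidence_match.group(1))

# Grade parsing, as in llm_match_score: 'C' maps to acc=1.0, anything else to 0.0.
acc = 0.0
grade_match = re.search(r'GRADE:\s*([CI])', judge_response, re.IGNORECASE)
if grade_match:
    acc = 1.0 if grade_match.group(1) == 'C' else 0.0

print(confidence, acc)  # 85 1.0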
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,7 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
 import re
-… (old lines 3-4: content not shown in this view)
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -10,28 +18,28 @@ logger = get_logger()
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
-@…
-… (old lines 14-30: content not shown in this view)
+@register_benchmark(
+    BenchmarkMeta(
+        name='humaneval',
+        pretty_name='HumanEval',
+        tags=[Tags.CODING],
+        description=
+        'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+        dataset_id='opencompass/humaneval',
+        subset_list=['openai_humaneval'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{question}',
+        extra_params={
+            'num_workers': 4,
+            'timeout': 4
+        },
+    )
 )
-class HumanevalAdapter(…
+class HumanevalAdapter(DefaultDataAdapter):
     """
-… (old line 34: content not shown in this view)
+    HumanEval adapter using the new data processing framework.
     """
 
     def __init__(self, **kwargs):

@@ -39,9 +47,11 @@ class HumanevalAdapter(DataAdapter):
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness
         except ImportError:
-            raise ImportError(…
-… (old lines 43-44: content not shown in this view)
+            raise ImportError(
+                'Please install human_eval:'
+                'https://github.com/openai/human-eval/tree/master#installation , '
+                'Note that you need to enable the execution code in the human_eval/execution.py first.'
+            )
         super().__init__(**kwargs)
 
         extra_params = kwargs.get('extra_params', {})

@@ -53,41 +63,62 @@ class HumanevalAdapter(DataAdapter):
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-    def …
-… (old lines 57-75: content not shown in this view)
-        return self.gen_prompt_data(full_prompt)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        query = record['prompt']
+        full_prompt = self.prompt_template.format(question=query)
+
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target=record['canonical_solution'],
+            metadata={
+                'task_id': record['task_id'],
+                'entry_point': record['entry_point'],
+                'prompt': record['prompt'],
+                'test': record['test'],
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
+        return self._postprocess(prediction)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
+        """Extract code from markdown code blocks."""
         blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
         if len(blocks) >= 1:
             text = blocks[0]
         return text
 
-    def …
-… (old lines 86-93: content not shown in this view)
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Execute the code and check correctness
+        res = self.eval_func(task_state.metadata, filtered_prediction, self.timeout)
+        passed = res['passed']
+
+        score.value = {'pass': passed}
+        score.explanation = res.get('result', 'Code execution completed')
+        score.metadata = {'task_id': task_state.metadata['task_id'], 'timeout': self.timeout, 'execution_result': res}
+        score.main_score_name = 'pass'
+
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # caculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
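Before the sandboxed execution in match_score, the HumanEval adapter strips the model reply down to code via extract_answer and _postprocess, which simply keep the first fenced code block if one exists. A small stand-alone version of that step, with a made-up model response, looks like this:

import re


def extract_first_code_block(text: str) -> str:
    # Same regex as HumanevalAdapter._postprocess: keep the first fenced code
    # block if present, otherwise return the text unchanged.
    blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
    if len(blocks) >= 1:
        text = blocks[0]
    return text


# Hypothetical model response; only the fenced block survives extraction.
response = 'Here is the implementation:\n```python\ndef add(a, b):\n    return a + b\n```\nHope this helps.'
print(extract_first_code_block(response))
# def add(a, b):
#     return a + b

Only the extracted block is then passed to check_correctness (the self.eval_func above) together with the record's test metadata and the configured timeout.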
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -1,54 +1,83 @@
-from collections import defaultdict
 from typing import Any, Dict, List
 
-from evalscope.…
-from evalscope.…
-from evalscope.…
-… (old lines 7-26: content not shown in this view)
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ifeval',
+        pretty_name='IFEval',
+        description=
+        'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING],
+        dataset_id='opencompass/ifeval',
+        subset_list=['default'],
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        prompt_template='',
+    )
 )
-class IFEvalAdapter(…
+class IFEvalAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-… (old lines 33-35: content not shown in this view)
-        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
-        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
 
-… (old lines 39-40: content not shown in this view)
+        Args:
+            record (Dict[str, Any]): Input data record.
 
-… (old lines 42-43: content not shown in this view)
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        prompt = record.get('prompt', '')
+        message_list = [ChatMessageUser(content=prompt)]
 
-… (old line 45: content not shown in this view)
+        return Sample(input=message_list, target='', metadata=record)
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
         from evalscope.benchmarks.ifeval.utils import process_results
 
-… (old line 48: content not shown in this view)
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+        try:
+            # Process results using the existing ifeval utility
+            results = process_results(doc, [filtered_prediction])
+            score.value.update(results)
+
+            # Set main score name
+            score.main_score_name = 'prompt_level_strict'
 
-… (old lines 50-52: content not shown in this view)
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}
 
-        return …
+        return score