evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects only the changes between those published versions.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55

@@ -1,7 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
 import re
-
-
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -10,28 +18,28 @@ logger = get_logger()
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


-@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='humaneval',
+        pretty_name='HumanEval',
+        tags=[Tags.CODING],
+        description=
+        'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+        dataset_id='opencompass/humaneval',
+        subset_list=['openai_humaneval'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{question}',
+        extra_params={
+            'num_workers': 4,
+            'timeout': 4
+        },
+    )
 )
-class HumanevalAdapter(
+class HumanevalAdapter(DefaultDataAdapter):
     """
-
+    HumanEval adapter using the new data processing framework.
     """

     def __init__(self, **kwargs):
@@ -39,9 +47,11 @@ class HumanevalAdapter(DataAdapter):
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness
         except ImportError:
-            raise ImportError(
-
-
+            raise ImportError(
+                'Please install human_eval:'
+                'https://github.com/openai/human-eval/tree/master#installation , '
+                'Note that you need to enable the execution code in the human_eval/execution.py first.'
+            )
         super().__init__(**kwargs)

         extra_params = kwargs.get('extra_params', {})
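The registration block above is the 1.0.0 entry point for a benchmark: a `BenchmarkMeta` passed to `register_benchmark`, with the adapter subclassing `DefaultDataAdapter` from `evalscope.api.benchmark` instead of the removed `evalscope/benchmarks/data_adapter.py` base class. A minimal sketch of that pattern, restricted to fields visible in this diff; the benchmark name, dataset id, and empty adapter body below are hypothetical placeholders, not code from the package:

```python
# Sketch only: field names follow the BenchmarkMeta call shown above;
# 'demo_bench', the dataset id, and the stub adapter are hypothetical.
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='demo_bench',              # hypothetical benchmark name
        pretty_name='DemoBench',
        tags=[Tags.CODING],
        description='Toy example of the 1.0.0 registration surface.',
        dataset_id='org/demo_dataset',  # hypothetical dataset id
        metric_list=['Pass@1'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class DemoBenchAdapter(DefaultDataAdapter):
    """Hooks such as record_to_sample / extract_answer / match_score
    would be overridden here, as HumanevalAdapter does in the next hunk."""
```

Which `BenchmarkMeta` fields are mandatory is not visible in this diff, so treat the sketch as illustrative only.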
@@ -53,41 +63,62 @@ class HumanevalAdapter(DataAdapter):
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return self.gen_prompt_data(full_prompt)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        query = record['prompt']
+        full_prompt = self.prompt_template.format(question=query)
+
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target=record['canonical_solution'],
+            metadata={
+                'task_id': record['task_id'],
+                'entry_point': record['entry_point'],
+                'prompt': record['prompt'],
+                'test': record['test'],
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
+        return self._postprocess(prediction)

     @classmethod
     def _postprocess(cls, text: str) -> str:
+        """Extract code from markdown code blocks."""
         blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
         if len(blocks) >= 1:
             text = blocks[0]
         return text

-    def
-
-
-
-
-
-
-
-
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Execute the code and check correctness
+        res = self.eval_func(task_state.metadata, filtered_prediction, self.timeout)
+        passed = res['passed']
+
+        score.value = {'pass': passed}
+        score.explanation = res.get('result', 'Code execution completed')
+        score.metadata = {'task_id': task_state.metadata['task_id'], 'timeout': self.timeout, 'execution_result': res}
+        score.main_score_name = 'pass'
+
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # caculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
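`aggregate_scores` above hands the per-sample `pass` values to `PassAtK`. The aggregator itself lives in `evalscope/metrics/metric.py` (+307 lines in this release) and is not shown in this diff; for orientation, the unbiased pass@k estimator that such aggregators are conventionally built on is sketched below.

```python
# Reference sketch of the standard unbiased pass@k estimator; the actual
# PassAtK class in evalscope/metrics/metric.py is not shown in this diff.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """n = completions generated per task, c = completions that passed, k = budget."""
    if n - c < k:
        return 1.0  # fewer than k failing completions: any draw of k must include a pass
    return 1.0 - comb(n - c, k) / comb(n, k)


# With metric_list=['Pass@1'] and a single completion per task, this reduces to
# the plain pass rate: pass_at_k(1, 1, 1) == 1.0 and pass_at_k(1, 0, 1) == 0.0.
```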
evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40

@@ -1,54 +1,83 @@
-from collections import defaultdict
 from typing import Any, Dict, List

-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-
-
-
-
-
-
-
-'
-'
-
-'
-
-
-
-
-
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ifeval',
+        pretty_name='IFEval',
+        description=
+        'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING],
+        dataset_id='opencompass/ifeval',
+        subset_list=['default'],
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        prompt_template='',
+    )
 )
-class IFEvalAdapter(
+class IFEvalAdapter(DefaultDataAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-
-
-
-        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
-        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.

-
-
+        Args:
+            record (Dict[str, Any]): Input data record.

-
-
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        prompt = record.get('prompt', '')
+        message_list = [ChatMessageUser(content=prompt)]

-
+        return Sample(input=message_list, target='', metadata=record)
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
         from evalscope.benchmarks.ifeval.utils import process_results

-
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+        try:
+            # Process results using the existing ifeval utility
+            results = process_results(doc, [filtered_prediction])
+            score.value.update(results)
+
+            # Set main score name
+            score.main_score_name = 'prompt_level_strict'

-
-
-
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}

-        return
+        return score
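`match_score` above delegates per-sample judging to `process_results`, and the benchmark reports the four metrics listed in `metric_list`. As a rough guide to what those names usually mean, prompt-level accuracy credits a sample only when every instruction in it is satisfied, while instruction-level accuracy scores each instruction separately. A small aggregation sketch over per-sample lists of booleans; this is illustrative only and not the package's own aggregation code:

```python
# Illustrative aggregation only; evalscope's aggregation is driven by the
# metric_list above and the per-sample results returned by process_results.
from typing import Dict, List


def aggregate_strict(per_sample_flags: List[List[bool]]) -> Dict[str, float]:
    """per_sample_flags[i] holds one bool per instruction in sample i (strict judging)."""
    prompt_hits = sum(all(flags) for flags in per_sample_flags)
    inst_hits = sum(sum(flags) for flags in per_sample_flags)
    inst_total = sum(len(flags) for flags in per_sample_flags)
    return {
        'prompt_level_strict': prompt_hits / len(per_sample_flags),
        'inst_level_strict': inst_hits / inst_total,
    }


print(aggregate_strict([[True, True], [True, False]]))
# {'prompt_level_strict': 0.5, 'inst_level_strict': 0.75}
```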
evalscope/benchmarks/ifeval/instructions.py +109 -64

@@ -21,7 +21,7 @@ import re
 import string
 from typing import Dict, Optional, Sequence, Union

-from
+from . import instructions_util

 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
         if self._language is None:
             self._language = random.choice(list(_LANGUAGES.keys()))
         # TODO(tianjianlu): opens the description generation to more choices.
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+        )
         return self._description_pattern.format(language=_LANGUAGES[self._language])

     def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
         self._num_placeholders = num_placeholders
         if self._num_placeholders is None or self._num_placeholders < 0:
             self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'The response must contain at least {num_placeholders} placeholders '
+            + 'represented by square brackets, such as [address].'
+        )
         return self._description_pattern.format(num_placeholders=self._num_placeholders)

     def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
         self._num_bullets = num_bullets
         if self._num_bullets is None or self._num_bullets < 0:
             self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your answer must contain exactly {num_bullets} bullet points. '
+            + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+        )
         return self._description_pattern.format(num_bullets=self._num_bullets)

     def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
         self._starter = starter.strip() if isinstance(starter, str) else starter
         if self._starter is None:
             self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+        )
         return self._description_pattern.format(starter=self._starter)

     def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
         if self._num_highlights is None or self._num_highlights < 0:
             self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Highlight at least {num_highlights} sections in your answer with '
+            + 'markdown, i.e. *highlighted section*.'
+        )

         return self._description_pattern.format(num_highlights=self._num_highlights)

@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
         if self._num_sections is None or self._num_sections < 0:
             self._num_sections = random.randint(1, _NUM_SECTIONS)

-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your response must have {num_sections} sections. Mark the beginning '
+            + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+            + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+        )

         return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)

@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
         if self._num_paragraphs is None or self._num_paragraphs < 0:
             self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+        )

         return self._description_pattern.format(num_paragraphs=self._num_paragraphs)

@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
           A string representing the instruction description.
         """
         self._postscript_marker = (
-            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+        )
         if self._postscript_marker is None:
             self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+        )

         return self._description_pattern.format(postscript=self._postscript_marker)

@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
                              'in the form of *change me*.')

         self._reference_without_change = original_message
-        self._description = (
-
+        self._description = (
+            'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+            + 'such as *change me*.'
+        )
         return self._description

     def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+        )

         return self._description_pattern.format(
             keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
     """Check the Json format."""

     def build_description(self):
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Entire output should be wrapped in JSON format. You can use markdown'
+            ' ticks such as ```.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):

     def check_following(self, value):
         value = (
-            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
-
+            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+            removesuffix('```').strip()
+        )
         try:
             json.loads(value)
         except ValueError:
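The `JsonFormat.check_following` change above only re-wraps the fence-stripping chain. As a standalone restatement of the same check (strip optional markdown fences, then require the remainder to parse as JSON), this is a sketch, not package code:

````python
import json


def is_json_response(value: str) -> bool:
    """Restatement of the check_following logic shown in the hunk above."""
    value = (
        value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON')
        .removeprefix('```').removesuffix('```').strip()
    )
    try:
        json.loads(value)  # JSONDecodeError subclasses ValueError
    except ValueError:
        return False
    return True
````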
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
             self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
         self._first_word = self._first_word.lower()

-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. '
+            + 'Paragraphs and only paragraphs are separated with each other by two '
+            + "new lines as if it was '\\n\\n' in python. "
+            + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+        )

         return self._description_pattern.format(
             num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
         self._low = low
         self._high = high

-        self._description = (
-
-
-
-
+        self._description = (
+            'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+            + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+            + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+            + "to 'ran'."
+        )

         return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)

@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Give two different responses. Responses and only responses should'
+            ' be separated by 6 asterisk symbols: ******.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
             raise ValueError('prompt_to_repeat must be set.')
         else:
             self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'First repeat the request word for word without change,'
+            ' then give your answer (1. do not say any words or characters'
+            ' before repeating the request; 2. the request you need to repeat'
+            ' does not include this sentence)'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
         self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Finish your response with this exact phrase {ender}. '
+            'No other words should follow this phrase.'
+        )
         return self._description_pattern.format(ender=self._end_phrase)

     def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your answer must contain a title, wrapped in double angular brackets,'
+            ' such as <<poem of joy>>.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
         if let_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+            )
         else:
             self._comparison_relation = let_relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the letter {letter} should appear {let_relation}'
+            ' {let_frequency} times.'
+        )

         return self._description_pattern.format(
             letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your entire response should be in English, and in all lowercase'
+            ' letters. No capital letters are allowed.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
         if capital_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
-
-
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+            )
+
+            self._description_pattern = (
+                'In your response, words with all capital letters should appear'
+                ' {relation} {frequency} times.'
+            )

         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
