evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,7 +1,15 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa: E501
  import re
-
- from evalscope.benchmarks import Benchmark, DataAdapter
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -10,28 +18,28 @@ logger = get_logger()
  # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


- @Benchmark.register(
- name='humaneval',
- pretty_name='HumanEval',
- tags=['Coding'],
- description=
- 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
- dataset_id='modelscope/humaneval',
- subset_list=['openai_humaneval'],
- metric_list=['Pass@1'],
- few_shot_num=0,
- train_split=None,
- eval_split='test',
- prompt_template=
- 'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}', # noqa: E501
- extra_params={
- 'num_workers': 4,
- 'timeout': 4
- },
+ @register_benchmark(
+ BenchmarkMeta(
+ name='humaneval',
+ pretty_name='HumanEval',
+ tags=[Tags.CODING],
+ description=
+ 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+ dataset_id='opencompass/humaneval',
+ subset_list=['openai_humaneval'],
+ metric_list=['Pass@1'],
+ eval_split='test',
+ prompt_template=
+ 'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{question}',
+ extra_params={
+ 'num_workers': 4,
+ 'timeout': 4
+ },
+ )
  )
- class HumanevalAdapter(DataAdapter):
+ class HumanevalAdapter(DefaultDataAdapter):
  """
- A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+ HumanEval adapter using the new data processing framework.
  """

  def __init__(self, **kwargs):
@@ -39,9 +47,11 @@ class HumanevalAdapter(DataAdapter):
  from human_eval.data import stream_jsonl, write_jsonl
  from human_eval.evaluation import check_correctness
  except ImportError:
- raise ImportError('Please install human_eval:'
- 'https://github.com/openai/human-eval/tree/master#installation , '
- 'Note that you need to enable the execution code in the human_eval/execution.py first.')
+ raise ImportError(
+ 'Please install human_eval:'
+ 'https://github.com/openai/human-eval/tree/master#installation , '
+ 'Note that you need to enable the execution code in the human_eval/execution.py first.'
+ )
  super().__init__(**kwargs)

  extra_params = kwargs.get('extra_params', {})
@@ -53,41 +63,62 @@
  self.write_jsonl_func = write_jsonl
  self.eval_func = check_correctness

- def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict = {}
- for subset_name in subset_list:
- data_dict[subset_name] = {}
- # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
- data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
-
- return data_dict
-
- def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
- """
- Generate prompt for the model.
-
- Args:
- input_d (dict): The raw input. A single data format of the Humaneval:
- {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
- """
- query = input_d['prompt']
- full_prompt = self.prompt_template.format(query=query)
-
- return self.gen_prompt_data(full_prompt)
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """Convert a data record to a Sample object."""
+ query = record['prompt']
+ full_prompt = self.prompt_template.format(question=query)
+
+ return Sample(
+ input=[ChatMessageUser(content=full_prompt)],
+ target=record['canonical_solution'],
+ metadata={
+ 'task_id': record['task_id'],
+ 'entry_point': record['entry_point'],
+ 'prompt': record['prompt'],
+ 'test': record['test'],
+ }
+ )
+
+ def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+ """Extract code from the prediction."""
+ return self._postprocess(prediction)

  @classmethod
  def _postprocess(cls, text: str) -> str:
+ """Extract code from markdown code blocks."""
  blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
  if len(blocks) >= 1:
  text = blocks[0]
  return text

- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- return self._postprocess(result)
-
- def get_gold_answer(self, input_d: dict) -> str:
- return input_d
-
- def match(self, gold: str, pred: str) -> float:
- res = self.eval_func(gold, pred, self.timeout)
- return float(res['passed'])
+ def match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+ ) -> Score:
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ # Execute the code and check correctness
+ res = self.eval_func(task_state.metadata, filtered_prediction, self.timeout)
+ passed = res['passed']
+
+ score.value = {'pass': passed}
+ score.explanation = res.get('result', 'Code execution completed')
+ score.metadata = {'task_id': task_state.metadata['task_id'], 'timeout': self.timeout, 'execution_result': res}
+ score.main_score_name = 'pass'
+
+ return score
+
+ def aggregate_scores(self, sample_scores):
+ from evalscope.metrics.metric import PassAtK
+
+ # caculate pass@k here
+ agg_list = []
+ for metric in self.metric_list:
+ if metric.lower().startswith('pass@'):
+ k = int(metric.split('@')[1])
+ # Get the scores for this metric
+ agg = PassAtK(k)
+ agg_list.extend(agg(sample_scores))
+ return agg_list
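
Both adapter rewrites in this release follow the same registration pattern introduced in 1.0: a BenchmarkMeta describing the benchmark is passed to register_benchmark, and the adapter subclasses DefaultDataAdapter and maps raw records to Sample objects. The following is a minimal sketch of that pattern, not part of the package diff; it reuses only the class and field names visible in the hunks above, while the benchmark name, dataset id, metric name, and the record fields 'question' and 'answer' are hypothetical placeholders.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_toy_qa',  # hypothetical benchmark name
        pretty_name='MyToyQA',
        tags=[Tags.CODING],  # a Tags member shown in the diff; pick whichever fits the benchmark
        description='Toy benchmark illustrating the 1.0 adapter registration pattern.',
        dataset_id='my-org/my-toy-qa',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],  # assumed free-form metric name, as in the IFEval hunk below
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyToyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample, mirroring HumanevalAdapter.record_to_sample above.
        prompt = self.prompt_template.format(question=record['question'])
        return Sample(
            input=[ChatMessageUser(content=prompt)],
            target=record['answer'],
            metadata=record,
        )

Scoring for such an adapter is handled by match_score; see the sketch after the IFEval adapter hunk below.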
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -1,54 +1,83 @@
- from collections import defaultdict
  from typing import Any, Dict, List

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import Metric, mean, metric_registry
-
-
- @Benchmark.register(
- name='ifeval',
- pretty_name='IFEval',
- tags=['Instruction-Following'],
- description=
- 'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
- dataset_id='opencompass/ifeval',
- subset_list=['default'],
- metric_list=[
- 'prompt_level_strict_acc',
- 'inst_level_strict_acc',
- 'prompt_level_loose_acc',
- 'inst_level_loose_acc',
- ],
- few_shot_num=0,
- train_split=None,
- eval_split='train',
- prompt_template='',
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='ifeval',
+ pretty_name='IFEval',
+ description=
+ 'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
+ tags=[Tags.INSTRUCTION_FOLLOWING],
+ dataset_id='opencompass/ifeval',
+ subset_list=['default'],
+ metric_list=[
+ 'prompt_level_strict',
+ 'inst_level_strict',
+ 'prompt_level_loose',
+ 'inst_level_loose',
+ ],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train',
+ prompt_template='',
+ )
  )
- class IFEvalAdapter(DataAdapter):
+ class IFEvalAdapter(DefaultDataAdapter):

  def __init__(self, **kwargs):
  super().__init__(**kwargs)

- # register metrics
- metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
- metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
- metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
- metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """
+ Convert a data record to a Sample object.

- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
- return self.gen_prompt_data(input_d['prompt'])
+ Args:
+ record (Dict[str, Any]): Input data record.

- def get_gold_answer(self, input_d: dict) -> str:
- return input_d
+ Returns:
+ Sample: Sample object with input, target, and metadata.
+ """
+ prompt = record.get('prompt', '')
+ message_list = [ChatMessageUser(content=prompt)]

- def match(self, gold: Any, pred: Any) -> Dict:
+ return Sample(input=message_list, target='', metadata=record)
+
+ def match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+ ) -> Score:
+ """
+ Calculate evaluation scores by comparing prediction with reference.
+ """
  from evalscope.benchmarks.ifeval.utils import process_results

- return process_results(gold, [pred])
+ # Initialize the score object with prediction details
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ doc = task_state.metadata
+ try:
+ # Process results using the existing ifeval utility
+ results = process_results(doc, [filtered_prediction])
+ score.value.update(results)
+
+ # Set main score name
+ score.main_score_name = 'prompt_level_strict'

- def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
- # aggregate review results
- res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+ except Exception as e:
+ logger.error(f'Error calculating ifeval metrics: {e}')
+ score.value = {}

- return super().compute_metric(res_dict, **kwargs)
+ return score
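
The scoring half of the new adapter contract is match_score: it receives the raw and post-processed predictions plus the TaskState, and returns a Score whose value dict holds one entry per metric in metric_list, with main_score_name naming the headline metric. Continuing the hypothetical MyToyQAAdapter from the sketch after the HumanEval hunks, here is a minimal exact-match version of that hook; it is an illustrative stand-in, not what IFEvalAdapter does, which delegates to process_results as shown above.

from evalscope.api.benchmark import DefaultDataAdapter
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score


class MyToyQAAdapter(DefaultDataAdapter):
    # record_to_sample(...) as in the earlier sketch.

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        # Keep both the raw and post-processed model output on the Score,
        # mirroring the Score(...) construction in the adapters above.
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        # One value per metric declared in metric_list; 'acc' is the hypothetical metric from the earlier sketch.
        score.value = {'acc': float(filtered_prediction.strip() == str(reference).strip())}
        score.main_score_name = 'acc'
        return score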
evalscope/benchmarks/ifeval/instructions.py

@@ -21,7 +21,7 @@ import re
  import string
  from typing import Dict, Optional, Sequence, Union

- from evalscope.benchmarks.ifeval import instructions_util
+ from . import instructions_util

  _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
  if self._language is None:
  self._language = random.choice(list(_LANGUAGES.keys()))
  # TODO(tianjianlu): opens the description generation to more choices.
- self._description_pattern = ('Your ENTIRE response should be in {language} language, no other '
- + 'language is allowed.')
+ self._description_pattern = (
+ 'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+ )
  return self._description_pattern.format(language=_LANGUAGES[self._language])

  def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
  self._num_placeholders = num_placeholders
  if self._num_placeholders is None or self._num_placeholders < 0:
  self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
- self._description_pattern = ('The response must contain at least {num_placeholders} placeholders '
- + 'represented by square brackets, such as [address].')
+ self._description_pattern = (
+ 'The response must contain at least {num_placeholders} placeholders '
+ + 'represented by square brackets, such as [address].'
+ )
  return self._description_pattern.format(num_placeholders=self._num_placeholders)

  def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
  self._num_bullets = num_bullets
  if self._num_bullets is None or self._num_bullets < 0:
  self._num_bullets = random.randint(1, _NUM_BULLETS)
- self._description_pattern = ('Your answer must contain exactly {num_bullets} bullet points. '
- + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n'
- + '* This is point 2')
+ self._description_pattern = (
+ 'Your answer must contain exactly {num_bullets} bullet points. '
+ + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+ )
  return self._description_pattern.format(num_bullets=self._num_bullets)

  def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
  self._starter = starter.strip() if isinstance(starter, str) else starter
  if self._starter is None:
  self._starter = random.choice(_STARTER_OPTIONS)
- self._description_pattern = ('During the conversation, when it is your turn, '
- + 'please always start with {starter}')
+ self._description_pattern = (
+ 'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+ )
  return self._description_pattern.format(starter=self._starter)

  def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
  if self._num_highlights is None or self._num_highlights < 0:
  self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)

- self._description_pattern = ('Highlight at least {num_highlights} sections in your answer with '
- + 'markdown, i.e. *highlighted section*.')
+ self._description_pattern = (
+ 'Highlight at least {num_highlights} sections in your answer with '
+ + 'markdown, i.e. *highlighted section*.'
+ )

  return self._description_pattern.format(num_highlights=self._num_highlights)

@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
  if self._num_sections is None or self._num_sections < 0:
  self._num_sections = random.randint(1, _NUM_SECTIONS)

- self._description_pattern = ('Your response must have {num_sections} sections. Mark the beginning '
- + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
- + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]')
+ self._description_pattern = (
+ 'Your response must have {num_sections} sections. Mark the beginning '
+ + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+ + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+ )

  return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)

@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
  if self._num_paragraphs is None or self._num_paragraphs < 0:
  self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)

- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
- + 'Paragraphs are separated with the markdown divider: ***')
+ self._description_pattern = (
+ 'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+ )

  return self._description_pattern.format(num_paragraphs=self._num_paragraphs)

@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
  A string representing the instruction description.
  """
  self._postscript_marker = (
- postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker)
+ postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+ )
  if self._postscript_marker is None:
  self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)

- self._description_pattern = ('At the end of your response, please explicitly add a postscript '
- + 'starting with {postscript}')
+ self._description_pattern = (
+ 'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+ )

  return self._description_pattern.format(postscript=self._postscript_marker)

@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
  'in the form of *change me*.')

  self._reference_without_change = original_message
- self._description = ('Rephrasing: Your rephrased response should only'
- + 'change the words/sentences in between two asterisks' + 'such as *change me*.')
+ self._description = (
+ 'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+ + 'such as *change me*.'
+ )
  return self._description

  def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

- self._description_pattern = ('In your response, the word {keyword} should appear {relation} '
- + '{frequency} times.')
+ self._description_pattern = (
+ 'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+ )

  return self._description_pattern.format(
  keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
  """Check the Json format."""

  def build_description(self):
- self._description_pattern = ('Entire output should be wrapped in JSON format. You can use markdown'
- ' ticks such as ```.')
+ self._description_pattern = (
+ 'Entire output should be wrapped in JSON format. You can use markdown'
+ ' ticks such as ```.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):

  def check_following(self, value):
  value = (
- value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
- '```').removesuffix('```').strip())
+ value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+ removesuffix('```').strip()
+ )
  try:
  json.loads(value)
  except ValueError:
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
  self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
  self._first_word = self._first_word.lower()

- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
- + 'Paragraphs and only paragraphs are separated with each other by two '
- + "new lines as if it was '\\n\\n' in python. "
- + 'Paragraph {nth_paragraph} must start with word {first_word}.')
+ self._description_pattern = (
+ 'There should be {num_paragraphs} paragraphs. '
+ + 'Paragraphs and only paragraphs are separated with each other by two '
+ + "new lines as if it was '\\n\\n' in python. "
+ + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+ )

  return self._description_pattern.format(
  num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
  self._low = low
  self._high = high

- self._description = ('Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
- + 'between {low} and {high} of the same words. '
- + 'Words are the same if and only if all of the '
- + 'letters, ignoring cases, are the same. For '
- + "example, 'run' is the same as 'Run' but different " + "to 'ran'.")
+ self._description = (
+ 'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+ + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+ + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+ + "to 'ran'."
+ )

  return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)

@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Give two different responses. Responses and only responses should'
- ' be separated by 6 asterisk symbols: ******.')
+ self._description_pattern = (
+ 'Give two different responses. Responses and only responses should'
+ ' be separated by 6 asterisk symbols: ******.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
  raise ValueError('prompt_to_repeat must be set.')
  else:
  self._prompt_to_repeat = prompt_to_repeat
- self._description_pattern = ('First repeat the request word for word without change,'
- ' then give your answer (1. do not say any words or characters'
- ' before repeating the request; 2. the request you need to repeat'
- ' does not include this sentence)')
+ self._description_pattern = (
+ 'First repeat the request word for word without change,'
+ ' then give your answer (1. do not say any words or characters'
+ ' before repeating the request; 2. the request you need to repeat'
+ ' does not include this sentence)'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
  self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
  if self._end_phrase is None:
  self._end_phrase = random.choice(_ENDING_OPTIONS)
- self._description_pattern = ('Finish your response with this exact phrase {ender}. '
- 'No other words should follow this phrase.')
+ self._description_pattern = (
+ 'Finish your response with this exact phrase {ender}. '
+ 'No other words should follow this phrase.'
+ )
  return self._description_pattern.format(ender=self._end_phrase)

  def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Your answer must contain a title, wrapped in double angular brackets,'
- ' such as <<poem of joy>>.')
+ self._description_pattern = (
+ 'Your answer must contain a title, wrapped in double angular brackets,'
+ ' such as <<poem of joy>>.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
  if let_relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif let_relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {let_relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+ )
  else:
  self._comparison_relation = let_relation

- self._description_pattern = ('In your response, the letter {letter} should appear {let_relation}'
- ' {let_frequency} times.')
+ self._description_pattern = (
+ 'In your response, the letter {letter} should appear {let_relation}'
+ ' {let_frequency} times.'
+ )

  return self._description_pattern.format(
  letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Your entire response should be in English, and in all lowercase'
- ' letters. No capital letters are allowed.')
+ self._description_pattern = (
+ 'Your entire response should be in English, and in all lowercase'
+ ' letters. No capital letters are allowed.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
  if capital_relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif capital_relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {capital_relation} is given.')
-
- self._description_pattern = ('In your response, words with all capital letters should appear'
- ' {relation} {frequency} times.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+ )
+
+ self._description_pattern = (
+ 'In your response, words with all capital letters should appear'
+ ' {relation} {frequency} times.'
+ )

  return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)