evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/multi_if/metrics.py
@@ -0,0 +1,120 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Tuple
+
+from evalscope.utils import get_logger
+from . import ifeval
+
+logger = get_logger()
+
+
+def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+    # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+    response = str(x['response'])
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+        if response and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+    response = str(x['response'])
+    r = response.split('\n')
+    response_remove_first = '\n'.join(r[1:]).strip()
+    response_remove_last = '\n'.join(r[:-1]).strip()
+    response_remove_both = '\n'.join(r[1:-1]).strip()
+    revised_response = response.replace('*', '')
+    revised_response_remove_first = response_remove_first.replace('*', '')
+    revised_response_remove_last = response_remove_last.replace('*', '')
+    revised_response_remove_both = response_remove_both.replace('*', '')
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+
+        is_following = False
+        for r in all_responses:  # type: ignore
+            if r.strip() and instruction.check_following(r):  # type: ignore
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+    prompt_total = 0
+    prompt_correct = 0
+    instruction_total = 0
+    instruction_correct = 0
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+
+        prompt_total += 1
+        if all(follow_instruction_list):
+            prompt_correct += 1
+
+        instruction_total += len(instruction_id_list)
+        instruction_correct += sum(follow_instruction_list)
+
+    return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+        instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+    prompt_res = []
+    inst_res = []
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+        if all(follow_instruction_list):
+            prompt_res.append(1)
+        else:
+            prompt_res.append(0)
+        inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+    return prompt_res, inst_res
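
The helpers above mirror the upstream IFEval scoring: gen_acc_strict checks the raw response against each instruction, gen_acc_loose retries against eight relaxed variants (asterisks stripped, first/last lines dropped), and parse_result reduces a list of per-response results to prompt-level and instruction-level accuracy. An illustrative sketch of the reduction step with hand-built inputs (the dict shape is exactly what the two generators return; the instruction ids are placeholders):

    from evalscope.benchmarks.multi_if.metrics import parse_result

    # Two hypothetical responses: the first satisfies 1 of its 2 instructions,
    # the second satisfies both.
    outputs = [
        {'follow_instruction_list': [True, False], 'instruction_id_list': ['id_a', 'id_b']},
        {'follow_instruction_list': [True, True], 'instruction_id_list': ['id_c', 'id_d']},
    ]
    prompt_acc, inst_acc = parse_result(outputs)
    # prompt_acc == 0.5   (1 of 2 responses followed every instruction)
    # inst_acc == 0.75    (3 of 4 individual instructions were followed)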
evalscope/benchmarks/multi_if/multi_if_adapter.py
@@ -0,0 +1,161 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+from evalscope.api.metric import Score
+from evalscope.api.model import Model
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Chinese',
+    'English',
+    'German',
+    'Italian',
+    'Vietnamese',
+    'Spanish',
+    'Hindi',
+    'Portuguese',
+    'French',
+    'Thai',
+    'Russian',
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='multi_if',
+        pretty_name='Multi-IF',
+        description=
+        'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+        dataset_id='facebook/Multi-IF',
+        subset_list=SUBSET_LIST,
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={
+            'max_turns': 3,  # maximum number of turns to evaluate
+        }
+    )
+)
+class MultiIFAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Ensure required packages are installed
+        check_import(
+            module_name=['nltk', 'langdetect'],
+            package=['nltk', 'langdetect'],
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+        if 'Chinese' in self.subset_list:
+            check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+        if 'Thai' in self.subset_list:
+            check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+        self.reformat_subset = True
+        self.max_turns = self.extra_params.get('max_turns', 3)
+        if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+            logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+            self.max_turns = 3
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=[ChatMessageUser(content='')],  # NOTE: we will build the multi turn conversation in the evaluator
+            target='',
+            subset_key=record['language'],
+            metadata=record,
+        )
+
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        """
+        Run multi-turn inference with the model and sample.
+        """
+        record = sample.metadata
+        history = []
+        step_record = {}
+        for step in range(1, self.max_turns + 1):
+            current_prompt = json.loads(record[f'turn_{step}_prompt'])
+            history.append(ChatMessageUser(content=current_prompt['content']))
+            # Generate model output
+            model_output = model.generate(input=history, tools=sample.tools)
+
+            response = model_output.completion
+            instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+            kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+            _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+            step_record[step] = {
+                'prompt': messages_pretty_str(history),
+                'response': response,
+                'instruction_id_list': instruction_id_list,
+                'kwargs': _kwargs
+            }
+
+            # Append model output to history for next turn
+            history.append(model_output.message)
+
+        sample.metadata['step_record'] = step_record
+        return TaskState(
+            model=model.name,
+            sample=sample,
+            messages=history,
+            output=model_output,
+            completed=True,
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
+        from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        step_record = task_state.metadata['step_record']
+        results = {}
+        try:
+            for step, record in step_record.items():
+                outputs_strict = gen_acc_strict(record)
+                outputs_loose = gen_acc_loose(record)
+                prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                results.update({
+                    f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                    f'turn_{step}_inst_level_strict': inst_level_strict,
+                    f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                    f'turn_{step}_inst_level_loose': inst_level_loose,
+                })
+            score.value.update(results)
+
+            # Set main score name
+            if results:
+                score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}
+
+        return score
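
A minimal sketch of driving the new benchmark through evalscope's standard entry points; TaskConfig and run_task are the library's public API, while the model name, limit, and max_turns values here are placeholders:

    from evalscope import TaskConfig, run_task

    task = TaskConfig(
        model='qwen-plus',  # placeholder model identifier
        datasets=['multi_if'],  # registered name from the adapter above
        dataset_args={'multi_if': {'extra_params': {'max_turns': 2}}},
        limit=10,  # small smoke-test run
    )
    run_task(task)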
File without changes
evalscope/benchmarks/music_trivia/music_trivia_adapter.py
@@ -0,0 +1,36 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = (
+    'MusicTrivia is a curated dataset of multiple-choice questions covering both classical and modern music topics. '
+    'It includes questions about composers, musical periods, and popular artists, designed for evaluating '
+    'factual recall and domain-specific music knowledge.'
+)  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='music_trivia',
+        pretty_name='MusicTrivia',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/music-trivia',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class MusicTriviaAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
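
The adapter body is a direct field mapping; below is a hypothetical dataset row and the equivalent Sample it becomes (field names are taken from record_to_sample above, the row itself is invented):

    from evalscope.api.dataset import Sample

    record = {
        'question': 'Which composer wrote "The Magic Flute"?',
        'choices': ['Mozart', 'Beethoven', 'Haydn', 'Salieri'],
        'answer': 'A',
    }
    # What MusicTriviaAdapter.record_to_sample(record) builds:
    sample = Sample(
        input=record['question'],
        choices=record['choices'],
        target=record['answer'],
        metadata={},
    )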
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
         'It requires the model to find specific information within a large corpus of text. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)',  # noqa: E501
         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
         metric_list=['acc'],
         subset_list=['english', 'chinese'],
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
         super().__init__(**kwargs)
 
         self._use_llm_judge = True
+        self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
         # set extra params
         self.retrieval_question = self.extra_params.get(
             'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -164,7 +165,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
             records.append(record)
 
         dataset = DictDataLoader(
-            dict_list=records,
+            dict_list=records,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
         ).load()
 
         datasets[subset_name] = dataset
@@ -355,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
 
         return score
 
-    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-        # Don't add aggregation name for needle haystack adapter
-        return super()._on_generate_report(scores, model_name, False)
-
     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os
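
The removed _on_generate_report override is superseded by the add_aggregation_name flag set in __init__; a custom adapter opts out the same way (a sketch, assuming DefaultDataAdapter consults the flag when building reports, as the replaced override suggests):

    from evalscope.api.benchmark import DefaultDataAdapter

    class MyAdapter(DefaultDataAdapter):  # hypothetical adapter
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.add_aggregation_name = False  # skip aggregation-name prefixes in reports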
File without changes
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py
@@ -0,0 +1,52 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+    'and social uses. The goal is to represent a broad range of activities, giving a '
+    'dataset more representative of the language used in this hardest of social media '
+    'formats to process.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='broad_twitter_corpus',
+        pretty_name='BroadTwitterCorpus',
+        dataset_id='extraordinarylab/broad-twitter-corpus',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class BroadTwitterCorpusAdapter(NERAdapter):
+    """
+    Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the BroadTwitterCorpus dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define BroadTwitterCorpus-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
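
The NER adapters introduced in this release all follow the shape above: declare entity_type_map and entity_descriptions, then call setup_entity_mappings(). A sketch of a new domain adapter built on the same base (the benchmark name, dataset id, and entity set are invented):

    from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
    from evalscope.api.registry import register_benchmark
    from evalscope.constants import Tags
    from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

    @register_benchmark(
        BenchmarkMeta(
            name='my_ner',  # hypothetical benchmark name
            pretty_name='MyNER',
            dataset_id='my-org/my-ner-dataset',  # hypothetical dataset id
            tags=[Tags.KNOWLEDGE, Tags.NER],
            description='Example domain-specific NER benchmark.',
            few_shot_num=5,
            train_split='train',
            eval_split='test',
            metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
            prompt_template=PROMPT_TEMPLATE,
            few_shot_prompt_template=FEWSHOT_TEMPLATE,
        )
    )
    class MyNERAdapter(NERAdapter):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.entity_type_map = {'DRUG': 'drug'}  # invented entity set
            self.entity_descriptions = {'DRUG': 'Names of medications or chemical compounds'}
            self.setup_entity_mappings()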
evalscope/benchmarks/ner/conll2003_adapter.py
@@ -0,0 +1,48 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='conll2003',
+        pretty_name='CoNLL2003',
+        dataset_id='evalscope/conll2003',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description='The ConLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+        'of the ConLL-2003 Shared Task conference and contains texts annotated with entities such as '
+        'people, organizations, places, and various names.',
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoNLL2003Adapter(NERAdapter):
+    """
+    Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CoNLL2003 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CoNLL2003-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+            'MISC': 'Miscellaneous entities not in the above categories'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/copious_adapter.py
@@ -0,0 +1,85 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'Copious corpus is a gold standard corpus that covers a wide range of biodiversity '
+    'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+    'Library with over 26K sentences and more than 28K entities.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='copious',
+        pretty_name='Copious',
+        dataset_id='extraordinarylab/copious',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CopiousAdapter(NERAdapter):
+    """
+    Adapter for the Copious Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the Copious dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define Copious-specific entity mappings
+        self.entity_type_map = {
+            'TAXON': 'taxon',
+            'GEOGRAPHICAL_LOCATION': 'geographical_location',
+            'HABITAT': 'habitat',
+            'PERSON': 'person',
+            'TEMPORAL_EXPRESSION': 'temporal_expression'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'TAXON': (
+                'Mentions of taxonomic ranks such as species, genus, and family. '
+                'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                'vernacular names (e.g., "flying fox"), but excludes general terms '
+                'like "fish" or "birds" and microorganism names.'
+            ),
+            'GEOGRAPHICAL_LOCATION': (
+                'Identifiable points or areas on the planet, including continents, '
+                'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                'of Mindoro", "Balayan Bay"). This also includes geographical '
+                'coordinates (e.g., "13o 36\' 11\\" N.").'
+            ),
+            'HABITAT': (
+                'Descriptions of environments where organisms live. This includes '
+                'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                'pastures") and places where parasites or epiphytes reside (e.g., '
+                '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                'like altitude or depth.'
+            ),
+            'PERSON': (
+                'Proper nouns referring to person names, including those in historical '
+                'accounts or citations related to a species observation (e.g., "In 1905, '
+                '[Tattersall] follows..."). It excludes titles, general references like '
+                '"the researcher", and names that are part of a taxon\'s authority.'
+            ),
+            'TEMPORAL_EXPRESSION': (
+                'Spans of text referring to points in time. This includes specific dates '
+                '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                'within a taxon name\'s authority.'
+            )
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/cross_ner_adapter.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+DESCRIPTION = (
+    'CrossNER is a fully-labelled collected of named entity recognition (NER) data '
+    'spanning over five diverse domains (AI, Literature, Music, Politics, Science).'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='cross_ner',
+        pretty_name='CrossNER',
+        dataset_id='extraordinarylab/cross-ner',
+        subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CrossNERAdapter(NERAdapter):
+    """
+    Adapter for the CrossNER Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CrossNER dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CrossNER-specific entity mappings
+        self.entity_type_map = {}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        if self.current_subset_name == 'ai':
+            self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+        elif self.current_subset_name == 'literature':
+            self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+        elif self.current_subset_name == 'music':
+            self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+        elif self.current_subset_name == 'politics':
+            self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+        elif self.current_subset_name == 'science':
+            self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
File without changes
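
For reference, a hedged illustration of the token/tag records the NER adapters consume and the tag-annotated target they are scored against; the exact output format comes from evalscope.utils.ner.create_target_text, and the <person>-style markers follow the entity_list construction in the CrossNER adapter above:

    tokens = ['Barack', 'Obama', 'visited', 'Paris', '.']  # invented row
    ner_tags = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']  # BIO tagging scheme assumed
    entity_type_map = {'PER': 'person', 'LOC': 'location'}
    # Expected shape of create_target_text(tokens, ner_tags, entity_type_map):
    #   '<person>Barack Obama</person> visited <location>Paris</location> .'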