evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/multi_if/metrics.py
@@ -0,0 +1,120 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Tuple
+
+from evalscope.utils import get_logger
+from . import ifeval
+
+logger = get_logger()
+
+
+def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+    # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+    response = str(x['response'])
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+        if response and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+    response = str(x['response'])
+    r = response.split('\n')
+    response_remove_first = '\n'.join(r[1:]).strip()
+    response_remove_last = '\n'.join(r[:-1]).strip()
+    response_remove_both = '\n'.join(r[1:-1]).strip()
+    revised_response = response.replace('*', '')
+    revised_response_remove_first = response_remove_first.replace('*', '')
+    revised_response_remove_last = response_remove_last.replace('*', '')
+    revised_response_remove_both = response_remove_both.replace('*', '')
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+
+        is_following = False
+        for r in all_responses:  # type: ignore
+            if r.strip() and instruction.check_following(r):  # type: ignore
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+    prompt_total = 0
+    prompt_correct = 0
+    instruction_total = 0
+    instruction_correct = 0
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+
+        prompt_total += 1
+        if all(follow_instruction_list):
+            prompt_correct += 1
+
+        instruction_total += len(instruction_id_list)
+        instruction_correct += sum(follow_instruction_list)
+
+    return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+        instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+    prompt_res = []
+    inst_res = []
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+        if all(follow_instruction_list):
+            prompt_res.append(1)
+        else:
+            prompt_res.append(0)
+        inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+    return prompt_res, inst_res
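
The helpers above mirror the upstream IFEval scoring: gen_acc_strict checks the raw response against each instruction, gen_acc_loose retries against eight relaxed variants (asterisks stripped, first/last lines dropped), and parse_result reduces a list of per-response results to prompt-level and instruction-level accuracy. An illustrative sketch of the reduction step with hand-built inputs (the dict shape is exactly what the two generators return; the instruction ids are placeholders):

    from evalscope.benchmarks.multi_if.metrics import parse_result

    # Two hypothetical responses: the first satisfies 1 of its 2 instructions,
    # the second satisfies both.
    outputs = [
        {'follow_instruction_list': [True, False], 'instruction_id_list': ['id_a', 'id_b']},
        {'follow_instruction_list': [True, True], 'instruction_id_list': ['id_c', 'id_d']},
    ]
    prompt_acc, inst_acc = parse_result(outputs)
    # prompt_acc == 0.5   (1 of 2 responses followed every instruction)
    # inst_acc == 0.75    (3 of 4 individual instructions were followed)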
evalscope/benchmarks/multi_if/multi_if_adapter.py
@@ -0,0 +1,161 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+from evalscope.api.metric import Score
+from evalscope.api.model import Model
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Chinese',
+    'English',
+    'German',
+    'Italian',
+    'Vietnamese',
+    'Spanish',
+    'Hindi',
+    'Portuguese',
+    'French',
+    'Thai',
+    'Russian',
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='multi_if',
+        pretty_name='Multi-IF',
+        description=
+        'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+        dataset_id='facebook/Multi-IF',
+        subset_list=SUBSET_LIST,
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={
+            'max_turns': 3,  # maximum number of turns to evaluate
+        }
+    )
+)
+class MultiIFAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Ensure required packages are installed
+        check_import(
+            module_name=['nltk', 'langdetect'],
+            package=['nltk', 'langdetect'],
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+        if 'Chinese' in self.subset_list:
+            check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+        if 'Thai' in self.subset_list:
+            check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+        self.reformat_subset = True
+        self.max_turns = self.extra_params.get('max_turns', 3)
+        if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+            logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+            self.max_turns = 3
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=[ChatMessageUser(content='')],  # NOTE: we will build the multi turn conversation in the evaluator
+            target='',
+            subset_key=record['language'],
+            metadata=record,
+        )
+
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        """
+        Run multi-turn inference with the model and sample.
+        """
+        record = sample.metadata
+        history = []
+        step_record = {}
+        for step in range(1, self.max_turns + 1):
+            current_prompt = json.loads(record[f'turn_{step}_prompt'])
+            history.append(ChatMessageUser(content=current_prompt['content']))
+            # Generate model output
+            model_output = model.generate(input=history, tools=sample.tools)
+
+            response = model_output.completion
+            instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+            kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+            _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+            step_record[step] = {
+                'prompt': messages_pretty_str(history),
+                'response': response,
+                'instruction_id_list': instruction_id_list,
+                'kwargs': _kwargs
+            }
+
+            # Append model output to history for next turn
+            history.append(model_output.message)
+
+        sample.metadata['step_record'] = step_record
+        return TaskState(
+            model=model.name,
+            sample=sample,
+            messages=history,
+            output=model_output,
+            completed=True,
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
+        from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        step_record = task_state.metadata['step_record']
+        results = {}
+        try:
+            for step, record in step_record.items():
+                outputs_strict = gen_acc_strict(record)
+                outputs_loose = gen_acc_loose(record)
+                prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                results.update({
+                    f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                    f'turn_{step}_inst_level_strict': inst_level_strict,
+                    f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                    f'turn_{step}_inst_level_loose': inst_level_loose,
+                })
+            score.value.update(results)
+
+            # Set main score name
+            if results:
+                score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}
+
+        return score
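
A minimal sketch of driving the new benchmark through evalscope's standard entry points; TaskConfig and run_task are the library's public API, while the model name, limit, and max_turns values here are placeholders:

    from evalscope import TaskConfig, run_task

    task = TaskConfig(
        model='qwen-plus',  # placeholder model identifier
        datasets=['multi_if'],  # registered name from the adapter above
        dataset_args={'multi_if': {'extra_params': {'max_turns': 2}}},
        limit=10,  # small smoke-test run
    )
    run_task(task)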
File without changes
evalscope/benchmarks/music_trivia/music_trivia_adapter.py
@@ -0,0 +1,36 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = (
+    'MusicTrivia is a curated dataset of multiple-choice questions covering both classical and modern music topics. '
+    'It includes questions about composers, musical periods, and popular artists, designed for evaluating '
+    'factual recall and domain-specific music knowledge.'
+)  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='music_trivia',
+        pretty_name='MusicTrivia',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/music-trivia',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class MusicTriviaAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
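
The adapter body is a direct field mapping; below is a hypothetical dataset row and the equivalent Sample it becomes (field names are taken from record_to_sample above, the row itself is invented):

    from evalscope.api.dataset import Sample

    record = {
        'question': 'Which composer wrote "The Magic Flute"?',
        'choices': ['Mozart', 'Beethoven', 'Haydn', 'Salieri'],
        'answer': 'A',
    }
    # What MusicTriviaAdapter.record_to_sample(record) builds:
    sample = Sample(
        input=record['question'],
        choices=record['choices'],
        target=record['answer'],
        metadata={},
    )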
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
         'It requires the model to find specific information within a large corpus of text. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)',  # noqa: E501
         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
         metric_list=['acc'],
         subset_list=['english', 'chinese'],
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
         super().__init__(**kwargs)
 
         self._use_llm_judge = True
+        self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
         # set extra params
         self.retrieval_question = self.extra_params.get(
             'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -164,7 +165,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
             records.append(record)
 
         dataset = DictDataLoader(
-            dict_list=records,
+            dict_list=records,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
         ).load()
 
         datasets[subset_name] = dataset
@@ -355,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
 
         return score
 
-    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-        # Don't add aggregation name for needle haystack adapter
-        return super()._on_generate_report(scores, model_name, False)
-
     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os
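
The removed _on_generate_report override is superseded by the add_aggregation_name flag set in __init__; a custom adapter opts out the same way (a sketch, assuming DefaultDataAdapter consults the flag when building reports, as the replaced override suggests):

    from evalscope.api.benchmark import DefaultDataAdapter

    class MyAdapter(DefaultDataAdapter):  # hypothetical adapter
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.add_aggregation_name = False  # skip aggregation-name prefixes in reports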
File without changes
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py
@@ -0,0 +1,52 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+    'and social uses. The goal is to represent a broad range of activities, giving a '
+    'dataset more representative of the language used in this hardest of social media '
+    'formats to process.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='broad_twitter_corpus',
+        pretty_name='BroadTwitterCorpus',
+        dataset_id='extraordinarylab/broad-twitter-corpus',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class BroadTwitterCorpusAdapter(NERAdapter):
+    """
+    Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the BroadTwitterCorpus dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define BroadTwitterCorpus-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
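
The NER adapters introduced in this release all follow the shape above: declare entity_type_map and entity_descriptions, then call setup_entity_mappings(). A sketch of a new domain adapter built on the same base (the benchmark name, dataset id, and entity set are invented):

    from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
    from evalscope.api.registry import register_benchmark
    from evalscope.constants import Tags
    from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

    @register_benchmark(
        BenchmarkMeta(
            name='my_ner',  # hypothetical benchmark name
            pretty_name='MyNER',
            dataset_id='my-org/my-ner-dataset',  # hypothetical dataset id
            tags=[Tags.KNOWLEDGE, Tags.NER],
            description='Example domain-specific NER benchmark.',
            few_shot_num=5,
            train_split='train',
            eval_split='test',
            metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
            prompt_template=PROMPT_TEMPLATE,
            few_shot_prompt_template=FEWSHOT_TEMPLATE,
        )
    )
    class MyNERAdapter(NERAdapter):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.entity_type_map = {'DRUG': 'drug'}  # invented entity set
            self.entity_descriptions = {'DRUG': 'Names of medications or chemical compounds'}
            self.setup_entity_mappings()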
evalscope/benchmarks/ner/conll2003_adapter.py
@@ -0,0 +1,48 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='conll2003',
+        pretty_name='CoNLL2003',
+        dataset_id='evalscope/conll2003',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description='The ConLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+        'of the ConLL-2003 Shared Task conference and contains texts annotated with entities such as '
+        'people, organizations, places, and various names.',
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoNLL2003Adapter(NERAdapter):
+    """
+    Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CoNLL2003 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CoNLL2003-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+            'MISC': 'Miscellaneous entities not in the above categories'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/copious_adapter.py
@@ -0,0 +1,85 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'Copious corpus is a gold standard corpus that covers a wide range of biodiversity '
+    'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+    'Library with over 26K sentences and more than 28K entities.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='copious',
+        pretty_name='Copious',
+        dataset_id='extraordinarylab/copious',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CopiousAdapter(NERAdapter):
+    """
+    Adapter for the Copious Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the Copious dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define Copious-specific entity mappings
+        self.entity_type_map = {
+            'TAXON': 'taxon',
+            'GEOGRAPHICAL_LOCATION': 'geographical_location',
+            'HABITAT': 'habitat',
+            'PERSON': 'person',
+            'TEMPORAL_EXPRESSION': 'temporal_expression'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'TAXON': (
+                'Mentions of taxonomic ranks such as species, genus, and family. '
+                'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                'vernacular names (e.g., "flying fox"), but excludes general terms '
+                'like "fish" or "birds" and microorganism names.'
+            ),
+            'GEOGRAPHICAL_LOCATION': (
+                'Identifiable points or areas on the planet, including continents, '
+                'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                'of Mindoro", "Balayan Bay"). This also includes geographical '
+                'coordinates (e.g., "13o 36\' 11\\" N.").'
+            ),
+            'HABITAT': (
+                'Descriptions of environments where organisms live. This includes '
+                'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                'pastures") and places where parasites or epiphytes reside (e.g., '
+                '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                'like altitude or depth.'
+            ),
+            'PERSON': (
+                'Proper nouns referring to person names, including those in historical '
+                'accounts or citations related to a species observation (e.g., "In 1905, '
+                '[Tattersall] follows..."). It excludes titles, general references like '
+                '"the researcher", and names that are part of a taxon\'s authority.'
+            ),
+            'TEMPORAL_EXPRESSION': (
+                'Spans of text referring to points in time. This includes specific dates '
+                '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                'within a taxon name\'s authority.'
+            )
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/cross_ner_adapter.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+DESCRIPTION = (
+    'CrossNER is a fully-labelled collected of named entity recognition (NER) data '
+    'spanning over five diverse domains (AI, Literature, Music, Politics, Science).'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='cross_ner',
+        pretty_name='CrossNER',
+        dataset_id='extraordinarylab/cross-ner',
+        subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CrossNERAdapter(NERAdapter):
+    """
+    Adapter for the CrossNER Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CrossNER dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CrossNER-specific entity mappings
+        self.entity_type_map = {}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        if self.current_subset_name == 'ai':
+            self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+        elif self.current_subset_name == 'literature':
+            self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+        elif self.current_subset_name == 'music':
+            self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+        elif self.current_subset_name == 'politics':
+            self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+        elif self.current_subset_name == 'science':
+            self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
File without changes
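
For reference, a hedged illustration of the token/tag records the NER adapters consume and the tag-annotated target they are scored against; the exact output format comes from evalscope.utils.ner.create_target_text, and the <person>-style markers follow the entity_list construction in the CrossNER adapter above:

    tokens = ['Barack', 'Obama', 'visited', 'Paris', '.']  # invented row
    ner_tags = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']  # BIO tagging scheme assumed
    entity_type_map = {'PER': 'person', 'LOC': 'location'}
    # Expected shape of create_target_text(tokens, ner_tags, entity_type_map):
    #   '<person>Barack Obama</person> visited <location>Paris</location> .'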