evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -0,0 +1,389 @@
+import os
+from itertools import product
+from tqdm import tqdm
+from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.report import Report
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+<question>
+{question}
+</question>
+
+Don't give information outside the document or repeat your findings."""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='needle_haystack',
+        pretty_name='Needle-in-a-Haystack',
+        tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+        description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+        'It requires the model to find specific information within a large corpus of text. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)',  # noqa: E501
+        dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+        metric_list=['acc'],
+        subset_list=['english', 'chinese'],
+        eval_split='test',
+        system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={
+            'retrieval_question':
+            'What is the best thing to do in San Francisco?',
+            'needles':
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+            'context_lengths_min':
+            1000,
+            'context_lengths_max':
+            32000,
+            'context_lengths_num_intervals':
+            10,
+            'document_depth_percent_min':
+            0,
+            'document_depth_percent_max':
+            100,
+            'document_depth_percent_intervals':
+            10,
+            'tokenizer_path':
+            'Qwen/Qwen3-0.6B',
+            'show_score':
+            False,
+        }
+    )
+)
+class NeedleHaystackAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self._use_llm_judge = True
+        self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
+        # set extra params
+        self.retrieval_question = self.extra_params.get(
+            'retrieval_question', 'What is the best thing to do in San Francisco?'
+        )
+        self.needles = self.extra_params.get(
+            'needles',
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+        )
+        self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+        self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+        self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+        self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+        self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+        self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+        self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = self.extra_params.get('show_score', False)
+
+        self._init_tokenizer()
+        self._init_length()
+
+    def _init_length(self):
+        """ Initialize context lengths and document depth percentages based on the provided parameters."""
+        import numpy as np
+
+        self.context_lengths = np.round(
+            np.linspace(
+                self.context_lengths_min,
+                self.context_lengths_max,
+                num=self.context_lengths_num_intervals,
+                endpoint=True
+            )
+        ).astype(int)
+
+        self.document_depth_percents = np.round(
+            np.linspace(
+                self.document_depth_percent_min,
+                self.document_depth_percent_max,
+                num=self.document_depth_percent_intervals,
+                endpoint=True
+            )
+        ).astype(int)
+
+    def _init_tokenizer(self):
+        """ Initialize the tokenizer based on the provided tokenizer path."""
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
+
+    def load(self):
+        """Load dataset from local disk or remote."""
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+            )
+
+        # Load datasets for both subsets
+        datasets = {}
+        file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+        for subset_name, files in file_structure.items():
+            if subset_name not in self.subset_list:
+                continue
+            file_path = os.path.join(dataset_path, files[0])
+            if os.path.exists(file_path):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+
+                # Generate samples for all combinations of context length and depth
+                records = []
+                tokens_context = self._get_context_tokens(text)
+                for context_length, depth_percent in tqdm(
+                    product(self.context_lengths, self.document_depth_percents),
+                    desc=f'Generating {subset_name} samples'
+                ):
+                    context = self._insert_needles(tokens_context, depth_percent, context_length)
+                    record = {
+                        'text': text,
+                        'context_length': int(context_length),
+                        'depth_percent': int(depth_percent),
+                        'question': self.retrieval_question,
+                        'answer': '\n'.join(self.needles),
+                        'context': context,
+                    }
+                    records.append(record)
+
+                dataset = DictDataLoader(
+                    dict_list=records,
+                    limit=self.limit,
+                    repeats=self.repeats,
+                    sample_fields=self.record_to_sample,
+                    shuffle=self.shuffle,
+                ).load()
+
+                datasets[subset_name] = dataset
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+            metadata={
+                'context': record['context'],
+                'context_length': record['context_length'],
+                'depth_percent': record['depth_percent'],
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with context and question."""
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
+
+    def _get_context_tokens(self, input_context: str) -> list:
+        """
+        Encodes the context string into tokens using the tokenizer, ensuring the tokenized context
+        is at least as long as the maximum context length required.
+
+        Args:
+            input_context (str): The context string to be tokenized.
+
+        Returns:
+            List[int]: A list of token IDs representing the context.
+        """
+        max_context_length = max(self.context_lengths)
+        context = input_context
+        tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+        # Repeat the context until reaching the required length
+        while len(tokens_context) < max_context_length:
+            context += '\n' + input_context
+            tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+        return tokens_context
+
+    def _insert_needles(self, tokens_context, depth_percent, context_length):
+        """
+        Inserts multiple needles (specific facts or pieces of information) into the original context string at
+        designated depth percentages, effectively distributing these needles throughout the context. This method
+        is designed to test a model's ability to retrieve specific information (needles) from a larger body of text
+        (haystack) based on the placement depth of these needles.
+
+        The method first encodes the context and each needle into tokens to calculate their lengths in tokens.
+        It then adjusts the context length to accommodate the final buffer length. This is crucial for ensuring
+        that the total token count (context plus needles) does not exceed the maximum allowable context length,
+        which might otherwise lead to information being truncated.
+
+        This approach calculates the initial insertion point for the first needle as before but then calculates even
+        spacing for the remaining needles based on the remaining context length. It ensures that needles are
+        distributed as evenly as possible throughout the context after the first insertion.
+
+        Args:
+            tokens_context (List[int]): The original context tokens.
+            depth_percent (float): The depth percent at which to insert the needles.
+            context_length (int): The total length of the context in tokens, adjusted for final buffer.
+
+        Returns:
+            str: The new context with needles inserted.
+        """
+
+        context_length -= 150
+
+        # Calculate the total length of all needles in tokens
+        total_needles_length = sum(len(self.tokenizer.encode(needle)) for needle in self.needles)
+
+        # Ensure context length accounts for needles
+        if len(tokens_context) + total_needles_length > context_length:
+            tokens_context = tokens_context[:context_length - total_needles_length]
+
+        # To evenly distribute the needles, we calculate the intervals they need to be inserted.
+        depth_percent_interval = (100 - depth_percent) / len(self.needles)
+
+        # Reset the insertion percentages list for the current context
+        self.insertion_percentages = []
+
+        # Insert needles at calculated points
+        for needle in self.needles:
+
+            tokens_needle = self.tokenizer.encode(needle)
+
+            if depth_percent == 100:
+                # If your depth percent is 100 (which means your needle is the last thing in the doc),
+                # throw it at the end
+                tokens_context = tokens_context + tokens_needle
+            else:
+                # Go get the position (in terms of tokens) to insert your needle
+                insertion_point = int(len(tokens_context) * (depth_percent / 100))
+
+                # tokens_new_context represents the tokens before the needle
+                tokens_new_context = tokens_context[:insertion_point]
+
+                # We want to make sure that we place our needle at a sentence break
+                # so we first see what token a '.' is
+                period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
+                    '。'
+                )  # Handle both English and Chinese periods
+
+                # Then we iterate backwards until we find the first period
+                while tokens_new_context and tokens_new_context[-1] not in period_tokens:
+                    insertion_point -= 1
+                    tokens_new_context = tokens_context[:insertion_point]
+
+                # Insert the needle into the context at the found position
+                tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
+
+                # Log
+                insertion_percentage = (insertion_point / len(tokens_context)) * 100
+                self.insertion_percentages.append(insertion_percentage)
+                logger.debug(
+                    f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                    f'total length now: {len(tokens_context)} tokens'
+                )
+
+            # Adjust depth for next needle
+            depth_percent += depth_percent_interval
+
+        new_context = self.tokenizer.decode(tokens_context)
+        return new_context
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        from evalscope.metrics import exact_match
+        from .utils import normalize_answer
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
+
+        norm_gold = normalize_answer(reference)
+        norm_pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=norm_gold, pred=norm_pred)
+
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
+
+        return score
+
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
+        question = task_state.input_text
+
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # Parse grading score with regex, [[score]]
+        accuracy = parse_score(orm_response) if orm_response else 0.0
+
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+        }
+        score.main_score_name = metric_name
+
+        return score
+
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
+        try:
+            import os
+
+            from .utils import draw_score_chat
+
+            report_path = output_dir
+            data_frame = report.to_dataframe()
+            # split `Metric` to `Context` and `Depth`
+            data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
+            data_frame['Depth'] = data_frame['Depth'].str.replace('Depth#', '').astype(float)
+            data_frame['Context'] = data_frame['Context'].str.replace('Context#', '').astype(int)
+            # split by `Subset` to multi sub data frame
+            for subset in data_frame['Subset'].unique():
+                sub_df = data_frame[data_frame['Subset'] == subset]
+                # draw charts for each subset
+                pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
+                                                 aggfunc='mean').reset_index()
+                pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
+                draw_score_chat(
+                    pivot_table,
+                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                    show_score=self.show_score
+                )
+
+        except Exception as e:
+            logger.error(f'Error generating charts: {e}')
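For orientation, the adapter above builds one sample per (context_length, depth_percent) pair (a 10 x 10 grid from 1,000 to 32,000 tokens with the defaults), inserts the needle sentences at the requested depth, and scores retrieval either by normalized exact match or with an LLM judge. A minimal sketch of driving it through evalscope's Python entry point follows; the TaskConfig fields are based on evalscope's documented usage plus the extra_params defined above, so exact names may differ in this release:

from evalscope import TaskConfig, run_task

# Hedged sketch: the model id is a placeholder, and the dataset_args keys
# mirror the extra_params declared by NeedleHaystackAdapter above.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['needle_haystack'],
    dataset_args={
        'needle_haystack': {
            'subset_list': ['english'],
            'extra_params': {
                'context_lengths_min': 1000,
                'context_lengths_max': 32000,
                'context_lengths_num_intervals': 10,
                'document_depth_percent_intervals': 10,
                'tokenizer_path': 'Qwen/Qwen3-0.6B',
                'show_score': True,
            },
        },
    },
)
run_task(task_cfg=task_cfg)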
evalscope/benchmarks/needle_haystack/utils.py
@@ -0,0 +1,79 @@
+import matplotlib.pyplot as plt
+import os
+import re
+import seaborn as sns
+import string
+from matplotlib.colors import LinearSegmentedColormap
+
+
+def normalize_answer(s):
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def parse_score(score_str: str) -> int:
+    """
+    Parses a score string and returns an integer score.
+    The score should be in the format [[score]].
+    """
+    score_match = re.search(r'\[\[(\d+)\]\]', score_str)
+    if score_match:
+        score = int(score_match.group(1))
+        return score / 10.0
+    else:
+        return 0.0
+
+
+def draw_score_chat(pivot_table, outpath, show_score=False):
+    # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
+    cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
+
+    # Create the heatmap with better aesthetics
+    plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
+    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
+
+    # More aesthetics
+    plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
+    plt.xlabel('Token Limit')  # X-axis label
+    plt.ylabel('Depth Percent')  # Y-axis label
+    plt.xticks(rotation=45)  # Rotates the x-axis labels to prevent overlap
+    plt.yticks(rotation=0)  # Ensures the y-axis labels are horizontal
+    plt.tight_layout()  # Fits everything neatly into the figure area
+
+    # save the figure
+    plt.savefig(outpath, dpi=300, bbox_inches='tight')
+
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
+Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
+You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
+You must provide your final score in the form of a number from 1 to 10, where:
+
+Score 1: The answer is completely unrelated to the reference.
+Score 3: The answer has minor relevance but does not align with the reference.
+Score 5: The answer has moderate relevance but contains inaccuracies.
+Score 7: The answer aligns with the reference but has minor omissions.
+Score 10: The answer is completely accurate and aligns perfectly with the reference.
+
+Only respond with a numerical score formatted as [[score]].""" # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Question: {question}
+
+Reference Answer: {gold}
+
+Model Answer: {pred}
+"""
File without changes
|
|
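Sketch of how these pieces typically fit together: GENERAL_ORM_PROMPT goes in as the system message, the filled ORM_USER_TEMPLATE as the user message, and the judge's bracketed verdict is normalized with `parse_score`. The chat-message structure below is illustrative, not the adapter's actual judge call.

judge_messages = [
    {'role': 'system', 'content': GENERAL_ORM_PROMPT},
    {
        'role': 'user',
        'content': ORM_USER_TEMPLATE.format(
            question='What is the capital of France?',
            gold='Paris',
            pred='The capital of France is Paris.',
        ),
    },
]
# Suppose the judge model replies '[[10]]':
score = parse_score('[[10]]')  # -> 1.0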
@@ -0,0 +1,52 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+    'and social uses. The goal is to represent a broad range of activities, giving a '
+    'dataset more representative of the language used in this hardest of social media '
+    'formats to process.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='broad_twitter_corpus',
+        pretty_name='BroadTwitterCorpus',
+        dataset_id='extraordinarylab/broad-twitter-corpus',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class BroadTwitterCorpusAdapter(NERAdapter):
+    """
+    Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the BroadTwitterCorpus dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define BroadTwitterCorpus-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
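Once registered, the benchmark is selectable by its `name`. A minimal sketch, assuming evalscope's `TaskConfig`/`run_task` entry points and a placeholder model id:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',    # placeholder model id
    datasets=['broad_twitter_corpus'],   # matches BenchmarkMeta.name above
    limit=10,                            # small smoke-test run
)
run_task(task_cfg=task_cfg)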
@@ -0,0 +1,48 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='conll2003',
+        pretty_name='CoNLL2003',
+        dataset_id='evalscope/conll2003',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description='The CoNLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+        'of the CoNLL-2003 shared task and contains texts annotated with entities such as '
+        'people, organizations, places, and various names.',
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoNLL2003Adapter(NERAdapter):
+    """
+    Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CoNLL2003 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CoNLL2003-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+            'MISC': 'Miscellaneous entities not in the above categories'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
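For reference, the metrics listed above are standard entity-level scores. A generic illustration on the classic CoNLL-style sentence "EU rejects German call to boycott British lamb" (not necessarily the exact matching logic inside NERAdapter):

gold = {('EU', 'organization'), ('German', 'miscellaneous'), ('British', 'miscellaneous')}
pred = {('EU', 'organization'), ('German', 'miscellaneous'), ('lamb', 'miscellaneous')}

tp = len(gold & pred)
precision = tp / len(pred) if pred else 0.0
recall = tp / len(gold) if gold else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}')  # all 0.67 here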
@@ -0,0 +1,85 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'Copious corpus is a gold standard corpus that covers a wide range of biodiversity '
+    'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+    'Library with over 26K sentences and more than 28K entities.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='copious',
+        pretty_name='Copious',
+        dataset_id='extraordinarylab/copious',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CopiousAdapter(NERAdapter):
+    """
+    Adapter for the Copious Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the Copious dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define Copious-specific entity mappings
+        self.entity_type_map = {
+            'TAXON': 'taxon',
+            'GEOGRAPHICAL_LOCATION': 'geographical_location',
+            'HABITAT': 'habitat',
+            'PERSON': 'person',
+            'TEMPORAL_EXPRESSION': 'temporal_expression'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'TAXON': (
+                'Mentions of taxonomic ranks such as species, genus, and family. '
+                'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                'vernacular names (e.g., "flying fox"), but excludes general terms '
+                'like "fish" or "birds" and microorganism names.'
+            ),
+            'GEOGRAPHICAL_LOCATION': (
+                'Identifiable points or areas on the planet, including continents, '
+                'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                'of Mindoro", "Balayan Bay"). This also includes geographical '
+                'coordinates (e.g., "13o 36\' 11\\" N.").'
+            ),
+            'HABITAT': (
+                'Descriptions of environments where organisms live. This includes '
+                'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                'pastures") and places where parasites or epiphytes reside (e.g., '
+                '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                'like altitude or depth.'
+            ),
+            'PERSON': (
+                'Proper nouns referring to person names, including those in historical '
+                'accounts or citations related to a species observation (e.g., "In 1905, '
+                '[Tattersall] follows..."). It excludes titles, general references like '
+                '"the researcher", and names that are part of a taxon\'s authority.'
+            ),
+            'TEMPORAL_EXPRESSION': (
+                'Spans of text referring to points in time. This includes specific dates '
+                '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                'within a taxon name\'s authority.'
+            )
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()