evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ner/cross_ner_adapter.py
@@ -0,0 +1,120 @@
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+ DESCRIPTION = (
+     'CrossNER is a fully-labelled collection of named entity recognition (NER) data '
+     'spanning five diverse domains (AI, Literature, Music, Politics, Science).'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='cross_ner',
+         pretty_name='CrossNER',
+         dataset_id='extraordinarylab/cross-ner',
+         subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CrossNERAdapter(NERAdapter):
+     """
+     Adapter for the CrossNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the CrossNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Entity mappings are subset-specific; populated by setup_entity_mappings()
+         self.entity_type_map = {}
+
+         # Descriptions for each entity type, also populated per subset
+         self.entity_descriptions = {}
+
+     def setup_entity_mappings(self):
+         """
+         Set up entity mappings and descriptions for prompt formatting.
+         This should be called after entity_type_map and entity_descriptions are defined.
+         """
+         if self.current_subset_name == 'ai':
+             self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+         elif self.current_subset_name == 'literature':
+             self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+         elif self.current_subset_name == 'music':
+             self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+         elif self.current_subset_name == 'politics':
+             self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+         elif self.current_subset_name == 'science':
+             self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+         # Reverse mapping for converting predictions back to canonical labels
+         self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+         # Create list of tags for prompt formatting
+         self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+         # Create description of entities for the prompt
+         self.entities_description = ', '.join([
+             f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+         ])
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a record with tokens and NER tags into a Sample.
+         Creates both the raw text input and the annotated text target.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+
+         tokens: List[str] = record['tokens']
+         ner_tags: List[str] = record['ner_tags']
+
+         # Create the input text by joining tokens
+         input_text = ' '.join(tokens)
+
+         # Process tokens and tags to create the annotated target text
+         target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+         # Store tokens and tags in metadata for evaluation
+         metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+         return Sample(input=input_text, target=target_text, metadata=metadata)
+
+     def format_prompt_template(self, sample):
+         """
+         Format the prompt with entity types, available tags, and the text to annotate.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.prompt_template.format(
+             entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+         )
+
+     def format_fewshot_template(self, fewshot, sample):
+         """
+         Format the few-shot prompt with all required parameters.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.few_shot_prompt_template.format(
+             fewshot=fewshot,
+             entities=self.entities_description,
+             entity_list=', '.join(self.entity_list),
+             text=sample.input
+         )
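
To make the prompt-construction path concrete, here is a minimal standalone sketch (not part of the diff) of what setup_entity_mappings computes, using a two-entry slice of the 'ai' subset mapping shown in the next hunk; the variable names mirror the adapter's attributes:

entity_type_map = {'ALGORITHM': 'algorithm', 'RESEARCHER': 'researcher'}  # trimmed to two entries
entity_descriptions = {
    'ALGORITHM': 'A specific algorithm or model architecture in AI.',
    'RESEARCHER': 'A person who conducts research in the field of AI.',
}

# Reverse map: lowercase tag name back to the canonical label, used when scoring predictions.
reverse_entity_map = {v.lower(): k for k, v in entity_type_map.items()}

# The tags the model may emit, interpolated into the prompt as {entity_list}.
entity_list = [f'<{ent.lower()}>' for ent in entity_type_map.values()]

# Human-readable legend, interpolated into the prompt as {entities}.
entities_description = ', '.join(
    f'{entity_type_map[tag]} ({entity_descriptions[tag]})' for tag in entity_type_map
)

print(entity_list)         # ['<algorithm>', '<researcher>']
print(reverse_entity_map)  # {'algorithm': 'ALGORITHM', 'researcher': 'RESEARCHER'}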
evalscope/benchmarks/ner/cross_ner_entities/ai.py
@@ -0,0 +1,54 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ALGORITHM': 'algorithm',
+         'CONFERENCE': 'conference',
+         'COUNTRY': 'country',
+         'FIELD': 'field',
+         'LOCATION': 'location',
+         'METRICS': 'metrics',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'PRODUCT': 'product',
+         'PROGRAMLANG': 'programming_language',
+         'RESEARCHER': 'researcher',
+         'TASK': 'task',
+         'UNIVERSITY': 'university'
+     }
+     entity_descriptions = {
+         'ALGORITHM':
+         ('A specific algorithm or model architecture in AI (e.g., "Transformer", '
+          '"gradient descent", "ResNet").'),
+         'CONFERENCE': ('An academic conference related to AI (e.g., "NeurIPS", "ICML", "CVPR").'),
+         'COUNTRY': ('A country mentioned in the context of AI research or development '
+                     '(e.g., "USA", "China").'),
+         'FIELD':
+         ('A sub-field or area of study within AI (e.g., "Natural Language Processing", '
+          '"Computer Vision").'),
+         'LOCATION':
+         ('A specific geographical location relevant to AI, other than countries '
+          '(e.g., "Silicon Valley").'),
+         'METRICS': ('A performance metric used to evaluate AI models (e.g., "F1-score", '
+                     '"BLEU", "accuracy").'),
+         'MISC': ('Miscellaneous AI-related terms that don\'t fit other categories '
+                  '(e.g., "Turing Award").'),
+         'ORGANISATION':
+         ('An organization, company, or lab involved in AI (e.g., "Google AI", '
+          '"OpenAI", "DeepMind").'),
+         'PERSON':
+         ('A person mentioned in the context of AI, who is not a researcher '
+          '(e.g., a CEO or public figure).'),
+         'PRODUCT': ('An AI-related product, framework, or software (e.g., "TensorFlow", '
+                     '"PyTorch", "AlphaGo").'),
+         'PROGRAMLANG': ('A programming language used in AI (e.g., "Python", "C++", "Julia").'),
+         'RESEARCHER': ('A person who conducts research in the field of AI (e.g., "Yann LeCun", '
+                        '"Geoffrey Hinton").'),
+         'TASK': (
+             'A specific problem or task that AI is used to solve (e.g., "Image Classification", '
+             '"Sentiment Analysis").'
+         ),
+         'UNIVERSITY':
+         ('A university or academic institution involved in AI research (e.g., '
+          '"Stanford University", "MIT").')
+     }
+     return entity_type_map, entity_descriptions
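
The annotated target that record_to_sample builds comes from create_target_text in evalscope/utils/ner.py (+377 lines, not shown in this diff). Below is a hypothetical re-implementation sketching what such a helper plausibly does — converting BIO tags into the lowercase XML-style span markers that entity_list advertises; the real implementation may differ in details:

from typing import Dict, List

def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
    out, open_tag = [], None
    for token, tag in zip(tokens, ner_tags):
        label = tag.split('-', 1)[1] if '-' in tag else None  # 'B-RESEARCHER' -> 'RESEARCHER'
        mapped = entity_type_map.get(label) if label else None
        if mapped != open_tag or tag.startswith('B-'):
            if open_tag:
                out.append(f'</{open_tag}>')  # close the previous span
            if mapped:
                out.append(f'<{mapped}>')     # open a new span
            open_tag = mapped
        out.append(token)
    if open_tag:
        out.append(f'</{open_tag}>')
    return ' '.join(out)

tokens = ['Geoffrey', 'Hinton', 'proposed', 'backpropagation']
tags = ['B-RESEARCHER', 'I-RESEARCHER', 'O', 'B-ALGORITHM']
print(create_target_text(tokens, tags, {'RESEARCHER': 'researcher', 'ALGORITHM': 'algorithm'}))
# <researcher> Geoffrey Hinton </researcher> proposed <algorithm> backpropagation </algorithm>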
evalscope/benchmarks/ner/cross_ner_entities/literature.py
@@ -0,0 +1,36 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'AWARD': 'award',
+         'BOOK': 'book',
+         'COUNTRY': 'country',
+         'EVENT': 'event',
+         'LITERARYGENRE': 'literary_genre',
+         'LOCATION': 'location',
+         'MAGAZINE': 'magazine',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'POEM': 'poem',
+         'WRITER': 'writer'
+     }
+     entity_descriptions = {
+         'AWARD': ('A literary award or prize (e.g., "Nobel Prize in Literature", "Booker Prize").'),
+         'BOOK': ('The title of a book (e.g., "Pride and Prejudice", "One Hundred Years of Solitude").'),
+         'COUNTRY': ('A country relevant to the literary context (e.g., "England", "Russia").'),
+         'EVENT': ('A literary festival or significant event (e.g., "Hay Festival", "Frankfurt Book Fair").'),
+         'LITERARYGENRE':
+         ('A genre or category of literature (e.g., "Science Fiction", "Gothic novel", '
+          '"magical realism").'),
+         'LOCATION': ('A real or fictional place mentioned in a literary context (e.g., "London", '
+                      '"Middle-earth").'),
+         'MAGAZINE': ('A magazine or literary journal (e.g., "The New Yorker", "Paris Review").'),
+         'MISC': ('Miscellaneous literary terms (e.g., "protagonist", "sonnet", '
+                  '"Shakespeare\'s Globe").'),
+         'ORGANISATION': ('A publishing house or literary organization (e.g., "Penguin Random House").'),
+         'PERSON': ('A character or person mentioned who is not a writer (e.g., "Elizabeth Bennet", '
+                    '"King Lear").'),
+         'POEM': ('The title of a poem (e.g., "The Waste Land", "Ozymandias").'),
+         'WRITER': ('The name of a writer, author, or poet (e.g., "Jane Austen", '
+                    '"Gabriel Garcia Marquez").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/music.py
@@ -0,0 +1,39 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ALBUM': 'album',
+         'AWARD': 'award',
+         'BAND': 'band',
+         'COUNTRY': 'country',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'MUSICALARTIST': 'musical_artist',
+         'MUSICALINSTRUMENT': 'musical_instrument',
+         'MUSICGENRE': 'music_genre',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'SONG': 'song'
+     }
+     entity_descriptions = {
+         'ALBUM': ('The title of a music album (e.g., "Abbey Road", "Thriller", "Lemonade").'),
+         'AWARD': ('A music award or prize (e.g., "Grammy Award", "MTV Music Award").'),
+         'BAND': ('The name of a musical group or band (e.g., "The Beatles", "Queen", "BTS").'),
+         'COUNTRY': ('A country relevant to the music context (e.g., "USA", "UK", "South Korea").'),
+         'EVENT': ('A music festival, concert tour, or event (e.g., "Glastonbury Festival", '
+                   '"Woodstock").'),
+         'LOCATION':
+         ('A venue, studio, or place relevant to music (e.g., "Madison Square Garden", '
+          '"Abbey Road Studios").'),
+         'MISC': ('Miscellaneous music-related terms (e.g., "synthesizer", "major key", '
+                  '"a cappella").'),
+         'MUSICALARTIST': ('A solo musician or singer (e.g., "Michael Jackson", "Taylor Swift", '
+                           '"Ed Sheeran").'),
+         'MUSICALINSTRUMENT': ('A musical instrument (e.g., "guitar", "piano", "violin").'),
+         'MUSICGENRE': ('A genre or style of music (e.g., "Rock", "Pop", "Jazz", "K-Pop").'),
+         'ORGANISATION': ('A record label or music organization (e.g., "Capitol Records", "Sony Music").'),
+         'PERSON':
+         ('A person related to music who is not a primary artist (e.g., a producer, '
+          'a songwriter, "John Lennon").'),
+         'SONG': ('The title of a song (e.g., "Bohemian Rhapsody", "Hey Jude", "Dynamite").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/politics.py
@@ -0,0 +1,37 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'COUNTRY': 'country',
+         'ELECTION': 'election',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'POLITICALPARTY': 'political_party',
+         'POLITICIAN': 'politician'
+     }
+     entity_descriptions = {
+         'COUNTRY': ('A country or sovereign state (e.g., "United States", "Germany").'),
+         'ELECTION': ('A specific election event (e.g., "2024 presidential election", '
+                      '"midterm elections").'),
+         'EVENT':
+         ('A significant political event, summit, or incident (e.g., "G7 Summit", '
+          '"Brexit", "Watergate scandal").'),
+         'LOCATION':
+         ('A politically significant building or location (e.g., "The White House", '
+          '"10 Downing Street").'),
+         'MISC': (
+             'Miscellaneous political terms, ideologies, or documents (e.g., "democracy", '
+             '"impeachment", "the Constitution").'
+         ),
+         'ORGANISATION':
+         ('A political or governmental organization (e.g., "United Nations", "NATO", '
+          '"European Union").'),
+         'PERSON':
+         ('A person mentioned in a political context who is not a politician '
+          '(e.g., a journalist, an activist).'),
+         'POLITICALPARTY': ('A named political party (e.g., "Democratic Party", "Conservative Party").'),
+         'POLITICIAN': ('A person who holds or seeks political office (e.g., "Joe Biden", '
+                        '"Angela Merkel").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/science.py
@@ -0,0 +1,58 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ACADEMICJOURNAL': 'academic_journal',
+         'ASTRONOMICALOBJECT': 'astronomical_object',
+         'AWARD': 'award',
+         'CHEMICALCOMPOUND': 'chemical_compound',
+         'CHEMICALELEMENT': 'chemical_element',
+         'COUNTRY': 'country',
+         'DISCIPLINE': 'discipline',
+         'ENZYME': 'enzyme',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'PROTEIN': 'protein',
+         'SCIENTIST': 'scientist',
+         'THEORY': 'theory',
+         'UNIVERSITY': 'university'
+     }
+     entity_descriptions = {
+         'ACADEMICJOURNAL': ('A scientific journal or publication (e.g., "Nature", "Science", "The Lancet").'),
+         'ASTRONOMICALOBJECT': ('A natural object in space (e.g., "Mars", "Andromeda Galaxy", '
+                                '"Halley\'s Comet").'),
+         'AWARD': ('A scientific award or prize (e.g., "Nobel Prize in Physics", "Fields Medal").'),
+         'CHEMICALCOMPOUND':
+         ('A chemical substance consisting of two or more elements (e.g., "H2O", '
+          '"Carbon Dioxide").'),
+         'CHEMICALELEMENT': ('An element from the periodic table (e.g., "Hydrogen", "Oxygen", "Gold").'),
+         'COUNTRY': ('A country relevant to a scientific context (e.g., "Switzerland" for CERN).'),
+         'DISCIPLINE':
+         ('A branch of science or academic discipline (e.g., "Physics", '
+          '"Molecular Biology", "Astronomy").'),
+         'ENZYME': ('A specific type of protein that acts as a catalyst (e.g., "Lactase", "Catalase").'),
+         'EVENT': ('A significant scientific mission or event (e.g., "Apollo 11 mission", '
+                   '"Human Genome Project").'),
+         'LOCATION':
+         ('A research facility or location of scientific importance (e.g., "CERN", '
+          '"International Space Station").'),
+         'MISC':
+         ('Miscellaneous scientific terms or concepts (e.g., "double helix", '
+          '"black hole", "quantum mechanics").'),
+         'ORGANISATION': ('A scientific organization or agency (e.g., "NASA", "Max Planck Society", "WHO").'),
+         'PERSON':
+         ('A person mentioned in a scientific context who is not a scientist '
+          '(e.g., a patient, a benefactor).'),
+         'PROTEIN': ('A specific protein (that is not an enzyme) (e.g., "Hemoglobin", '
+                     '"Insulin", "Keratin").'),
+         'SCIENTIST':
+         ('A person who is a scientist, researcher, or inventor (e.g., "Albert Einstein", '
+          '"Marie Curie").'),
+         'THEORY': ('A named scientific theory or law (e.g., "Theory of Relativity", '
+                    '"Big Bang Theory").'),
+         'UNIVERSITY':
+         ('A university or academic institution involved in science (e.g., '
+          '"Cambridge University", "Caltech").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/genia_ner_adapter.py
@@ -0,0 +1,66 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'GeniaNER consists of 2,000 MEDLINE abstracts with more than 400,000 words '
+     'and almost 100,000 annotations for biological terms.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='genia_ner',
+         pretty_name='GeniaNER',
+         dataset_id='extraordinarylab/genia-ner',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class GeniaNERAdapter(NERAdapter):
+     """
+     Adapter for the GeniaNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the GeniaNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define GeniaNER-specific entity mappings
+         self.entity_type_map = {
+             'CELL_LINE': 'cell_line',
+             'CELL_TYPE': 'cell_type',
+             'DNA': 'dna',
+             'PROTEIN': 'protein',
+             'RNA': 'rna'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'CELL_LINE':
+             'A population of cells derived from a single cell and grown in a culture.',
+             'CELL_TYPE':
+             ('A category of cells that are part of a larger organism and share a specific '
+              'structure and function.'),
+             'DNA':
+             'Deoxyribonucleic acid. This includes specific genes, domains, and regions of a DNA molecule.',
+             'PROTEIN': (
+                 'Molecules composed of amino acids that perform a vast array of functions within '
+                 'organisms. This includes enzymes, receptors, and signaling molecules.'
+             ),
+             'RNA':
+             'Ribonucleic acid. This refers to RNA molecules, including messenger RNA (mRNA) and other types.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/harvey_ner_adapter.py
@@ -0,0 +1,58 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'HarveyNER is a dataset with fine-grained locations annotated in tweets. This dataset '
+     'presents unique challenges and contains many complex and long location mentions '
+     'in informal descriptions.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='harvey_ner',
+         pretty_name='HarveyNER',
+         dataset_id='extraordinarylab/harvey-ner',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class HarveyNERAdapter(NERAdapter):
+     """
+     Adapter for the HarveyNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the HarveyNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define HarveyNER-specific entity mappings
+         self.entity_type_map = {'AREA': 'area', 'POINT': 'point', 'RIVER': 'river', 'ROAD': 'road'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'AREA':
+             'Geographical entities such as city subdivisions, neighborhoods, etc.',
+             'POINT': (
+                 'An exact location that a geocoordinate can be assigned. E.g., a uniquely named '
+                 'building, intersections of roads or rivers.'
+             ),
+             'RIVER':
+             'A river or a section of a river.',
+             'ROAD':
+             'A road or a section of a road.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py
@@ -0,0 +1,74 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Movie-Trivia dataset, originally created for slot filling, is modified by '
+     'ignoring some slot types (e.g. genre, rating) and merging others (e.g. director '
+     'and actor into person, and song and movie title into title) in order to keep '
+     'consistent named entity types across all datasets.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit_movie_trivia',
+         pretty_name='MIT-Movie-Trivia',
+         dataset_id='extraordinarylab/mit-movie-trivia',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITMovieTriviaAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Movie-Trivia Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Movie-Trivia dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Movie-Trivia-specific entity mappings
+         self.entity_type_map = {
+             'ACTOR': 'actor',
+             'AWARD': 'award',
+             'CHARACTER_NAME': 'character_name',
+             'DIRECTOR': 'director',
+             'GENRE': 'genre',
+             'OPINION': 'opinion',
+             'ORIGIN': 'origin',
+             'PLOT': 'plot',
+             'QUOTE': 'quote',
+             'RELATIONSHIP': 'relationship',
+             'SOUNDTRACK': 'soundtrack',
+             'YEAR': 'year'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'ACTOR': 'The name of an actor or actress starring in the movie.',
+             'AWARD': 'An award the movie won or was nominated for.',
+             'CHARACTER_NAME': 'The name of a character in the movie.',
+             'DIRECTOR': 'The name of the person who directed the movie.',
+             'GENRE': 'The category or style of the movie.',
+             'OPINION': 'A subjective review or personal opinion about the movie.',
+             'ORIGIN': 'The source material or basis for the movie.',
+             'PLOT': 'A description or summary of the movie\'s storyline.',
+             'QUOTE': 'A memorable line or phrase spoken in the movie.',
+             'RELATIONSHIP': 'The connection or relationship between characters.',
+             'SOUNDTRACK': 'The music or a specific song from the movie.',
+             'YEAR': 'The release year of the movie.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_restaurant_adapter.py
@@ -0,0 +1,66 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Restaurant dataset is a collection of restaurant review text specifically '
+     'curated for training and testing Natural Language Processing (NLP) models, '
+     'particularly for Named Entity Recognition (NER). It contains sentences from real '
+     'reviews, along with corresponding labels in the BIO format.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit_restaurant',
+         pretty_name='MIT-Restaurant',
+         dataset_id='extraordinarylab/mit-restaurant',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITRestaurantAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Restaurant Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Restaurant dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Restaurant-specific entity mappings
+         self.entity_type_map = {
+             'AMENITY': 'amenity',
+             'CUISINE': 'cuisine',
+             'DISH': 'dish',
+             'HOURS': 'hours',
+             'LOCATION': 'location',
+             'PRICE': 'price',
+             'RATING': 'rating',
+             'RESTAURANT_NAME': 'restaurant_name'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'AMENITY': 'A feature or service offered by the restaurant.',
+             'CUISINE': 'The type of food a restaurant serves.',
+             'DISH': 'A specific food or drink item.',
+             'HOURS': 'The operating hours of a restaurant.',
+             'LOCATION': 'The address or general location of a restaurant.',
+             'PRICE': 'The price range of a restaurant.',
+             'RATING': 'A rating or review of the restaurant.',
+             'RESTAURANT_NAME': 'The name of a restaurant.',
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
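
Finally, a hedged end-to-end sketch of how one of the benchmarks registered above would be run. It follows evalscope's documented TaskConfig/run_task entry points; the model id, the limit, and the dataset_args narrowing are illustrative placeholders, so consult the 1.2.0 docs for the exact fields:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder; any supported model id
    datasets=['cross_ner'],              # the name passed to register_benchmark above
    dataset_args={'cross_ner': {'subset_list': ['ai', 'music']}},  # optional subset narrowing
    limit=10,                            # smoke-test on the first 10 samples per subset
)
run_task(task_cfg=task_cfg)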