evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/utils/logger.py
CHANGED
@@ -28,22 +28,41 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+info_set = set()
+warning_set = set()
+
+
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)
+
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)
+
 
 def get_logger(
     log_file: Optional[str] = None,
     name: Optional[str] = None,
     log_level: int = DEFAULT_LEVEL,
     file_mode: str = 'w',
-    force=False
+    force: bool = False,
 ):
     """Get logging logger
 
     Args:
-        log_file: Log filename
-
-        log_level: Logging level.
-        file_mode:
-
+        log_file: Log filename. If specified, a file handler will be added to the logger.
+        name: Logical component name. Used to derive the logger name.
+        log_level: Logging level to set.
+        file_mode: Mode to open the file when log_file is provided (default 'w').
+        force: If True, reconfigure the existing logger (levels, formatters, handlers).
     """
 
     if name:
@@ -58,7 +77,7 @@ def get_logger(
         logger.setLevel(log_level)
         for handler in logger.handlers:
             handler.setLevel(log_level)
-            #
+            # Select formatter by handler type
             if isinstance(handler, logging.FileHandler):
                 handler.setFormatter(
                     plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
@@ -67,6 +86,7 @@ def get_logger(
                 handler.setFormatter(
                     color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
                 )
+        # Ensure file handler points to current log_file (replace if needed)
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
         return logger
 
@@ -88,7 +108,7 @@ def get_logger(
     handlers = [stream_handler]
 
     if is_worker0 and log_file is not None:
-        file_handler = logging.FileHandler(log_file, file_mode)
+        file_handler = logging.FileHandler(log_file, file_mode, encoding='utf-8')
         handlers.append(file_handler)
 
     for handler in handlers:
@@ -118,23 +138,54 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
        get_logger(log_level=logging.DEBUG, force=True)
 
 
-def add_file_handler_if_needed(
-
-
-
+def add_file_handler_if_needed(
+    logger: logging.Logger,
+    log_file: Optional[str],
+    file_mode: str,
+    log_level: int,
+) -> None:
+    """Ensure logger has a FileHandler targeting log_file.
+    - If no FileHandler exists, add one.
+    - If a FileHandler exists but points to a different file, replace it.
+    """
+    if log_file is None:
+        return
 
+    # Only worker-0 writes files
     if iutil.find_spec('torch') is not None:
         from modelscope.utils.torch_utils import is_master
-
         is_worker0 = is_master()
     else:
         is_worker0 = True
 
-    if
-
-
-
-
+    if not is_worker0:
+        return
+
+    target_path = os.path.abspath(log_file)
+    existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+    # If there is a FileHandler already pointing to the target file, nothing to do.
+    for fh in existing_file_handlers:
+        try:
+            if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                return
+        except Exception:
+            # If any issue retrieving baseFilename, fall through to replacement
+            pass
+
+    # Replace all existing FileHandlers with the new one
+    for fh in existing_file_handlers:
+        try:
+            logger.removeHandler(fh)
+            fh.flush()
+            fh.close()
+        except Exception:
+            pass
+
+    file_handler = logging.FileHandler(target_path, file_mode, encoding='utf-8')
+    file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+    file_handler.setLevel(log_level)
+    logger.addHandler(file_handler)
 
 
 def warn_once(logger: Logger, message: str) -> None:
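The net effect of the `add_file_handler_if_needed` rewrite is that repeated `get_logger(log_file=...)` calls retarget the file handler instead of stacking duplicates. A minimal standalone sketch of that replace-or-add logic (formatters and the worker-0 check omitted; the function name here is illustrative, not evalscope's API):

```python
import logging
import os

def retarget_file_handler(logger: logging.Logger, log_file: str) -> None:
    # Keep the handler if it already writes to the requested file.
    target = os.path.abspath(log_file)
    for h in list(logger.handlers):
        if isinstance(h, logging.FileHandler):
            if os.path.abspath(h.baseFilename) == target:
                return
            # Stale target: drop the old handler before adding the new one.
            logger.removeHandler(h)
            h.close()
    logger.addHandler(logging.FileHandler(target, 'w', encoding='utf-8'))

log = logging.getLogger('demo')
retarget_file_handler(log, 'run1.log')
retarget_file_handler(log, 'run1.log')  # no-op: same file
retarget_file_handler(log, 'run2.log')  # replaces the run1.log handler
```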
evalscope/utils/model_utils.py
CHANGED
@@ -3,6 +3,8 @@ import random
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
+from evalscope.utils.import_utils import check_import
+
 if TYPE_CHECKING:
     from transformers import GenerationConfig
 
@@ -67,7 +69,8 @@ def seed_everything(seed: int):
     """
     random.seed(seed)
     np.random.seed(seed)
-    try:
+
+    if check_import('torch', raise_warning=False):
         import torch
 
         torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
         torch.cuda.manual_seed_all(seed)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
-    except ImportError:
-        pass
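`seed_everything` now gates its torch branch on `check_import('torch', raise_warning=False)` rather than a bare `try`/`except ImportError`. A rough stand-in for that helper, assuming it simply probes importability (the real implementation lives in `evalscope/utils/import_utils.py` and is not shown in this diff):

```python
import importlib.util

def check_import(module: str, raise_warning: bool = True) -> bool:
    """Hypothetical stand-in: True when `module` can be imported."""
    available = importlib.util.find_spec(module) is not None
    if not available and raise_warning:
        print(f'warning: optional dependency {module!r} is not installed')
    return available

if check_import('torch', raise_warning=False):
    import torch  # only reached when torch is installed
    torch.manual_seed(42)
```

The practical difference from the old `try`/`except ImportError` is that the guard never imports torch just to discover it is missing, and it cannot mask unrelated `ImportError`s raised from inside torch's own import.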
evalscope/utils/multi_choices.py
CHANGED
@@ -1,11 +1,8 @@
 # flake8: noqa: E501
-from __future__ import annotations
-
 import re
-from typing import
+from typing import List, Optional, Union
 
-
-from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.api.evaluator import Choices, Target, TaskState
 
 FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:
 
@@ -84,10 +81,27 @@ def answer_options(choices: Choices) -> str:
     return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
 
 
-def
+def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
+    """
+    Returns the `choices` formatted as a letter list, e.g.:
+
+    ["choice 1", "choice 2", "choice 3"] ->
+    "A,B,C"
+    """
+    if isinstance(choices, list):
+        choices = Choices(choices)
+
+    indexes = list(range(len(choices)))
+
+    return ','.join([f'{answer_character(i)}' for i in indexes])
+
+
+def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
+    if isinstance(choices, list):
+        choices = Choices(choices)
 
     choices_text = answer_options(choices)
-    letters =
+    letters = format_letter_choices(choices)
     if not fewshot:
         return template.format(
             choices=choices_text,
@@ -122,6 +136,14 @@ def format_example(
     return f'{question}\n{choices_text}\nANSWER: {answer.text}'
 
 
+def _fallback_parse_answer(completion: str) -> Optional[set[str]]:
+    # Fallback to find the last upper case letter
+    for letter in reversed(completion):
+        if letter.isupper():
+            return {letter}
+    return None
+
+
 def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
     """
     Convenience function for extracting answers from the state output.
@@ -150,6 +172,11 @@ def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
         state.output.completion,
     )
 
+    if match is None:
+        fallback_answer = _fallback_parse_answer(state.output.completion)
+        if fallback_answer:
+            return fallback_answer
+
     if match is None:
         return set()
 
@@ -200,6 +227,11 @@ def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[str]:
     pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
     match = re.search(pattern, state.output.completion, flags=re.MULTILINE)
 
+    if match is None:
+        fallback_answer = _fallback_parse_answer(state.output.completion)
+        if fallback_answer:
+            return fallback_answer
+
     if match is None:
         return set()
 
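Both `parse_answers` and `parse_answers_zh` now share `_fallback_parse_answer`, which scans the completion from the end and returns the last uppercase letter when the explicit `ANSWER:`/`答案:` patterns fail to match. A quick illustration of that behavior (standalone copy for demonstration):

```python
from typing import Optional, Set

def fallback_parse_answer(completion: str) -> Optional[Set[str]]:
    # Same rule as the diff above: the last uppercase letter wins.
    for letter in reversed(completion):
        if letter.isupper():
            return {letter}
    return None

print(fallback_parse_answer('After some thought, the answer is (B).'))  # {'B'}
print(fallback_parse_answer('no capital letters here'))                 # None
```

Note the trade-off: the fallback rescues completions that state an answer informally, but any stray capital after the intended letter (e.g. a trailing "OK") would be picked up instead.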
evalscope/utils/ner.py
ADDED
@@ -0,0 +1,377 @@
+import re
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """
+You are a named entity recognition system that identifies the following entity types:
+{entities}
+
+Process the provided text and mark all named entities with XML-style tags.
+
+For example:
+<person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+Available entity tags: {entity_list}
+
+INSTRUCTIONS:
+1. Wrap your entire response in <response>...</response> tags.
+2. Inside these tags, include the original text with entity tags inserted.
+3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+4. Tag ALL entities you can identify using the exact tag names provided.
+5. Do not include explanations, just the tagged text.
+6. If entity spans overlap, choose the most specific entity type.
+7. Ensure every opening tag has a matching closing tag.
+
+Text to process:
+{text}
+""".lstrip()
+
+FEWSHOT_TEMPLATE = """
+Here are some examples of named entity recognition:
+
+{fewshot}
+
+You are a named entity recognition system that identifies the following entity types:
+{entities}
+
+Process the provided text and mark all named entities with XML-style tags.
+
+For example:
+<person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+Available entity tags: {entity_list}
+
+INSTRUCTIONS:
+1. Wrap your entire response in <response>...</response> tags.
+2. Inside these tags, include the original text with entity tags inserted.
+3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+4. Tag ALL entities you can identify using the exact tag names provided.
+5. Do not include explanations, just the tagged text.
+6. If entity spans overlap, choose the most specific entity type.
+7. Ensure every opening tag has a matching closing tag.
+
+Text to process:
+{text}
+""".lstrip()
+
+# Common error patterns to handle in XML predictions
+DEFAULT_TAG_FIX_PATTERNS = [
+    # Fix mismatched tags
+    (r'<(\w+)>(.*?)</\w+>', r'<\1>\2</\1>'),
+]
+
+
+def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
+    """
+    Create annotated text from tokens and NER tags.
+    Handles BIO tagging scheme conversion to inline XML-style tags.
+
+    Args:
+        tokens: List of text tokens
+        ner_tags: List of BIO tags corresponding to tokens
+        entity_type_map: Mapping from BIO entity types to user-friendly tag names
+
+    Returns:
+        String with XML-style entity markup wrapped in <response> tags
+    """
+    result = []
+    current_entity = None
+    entity_tokens = []
+
+    for i, (token, tag) in enumerate(zip(tokens, ner_tags)):
+        if tag.startswith('B-'):  # Beginning of entity
+            # Close previous entity if exists
+            if current_entity:
+                entity_type = entity_type_map.get(current_entity, '')
+                if entity_type:
+                    result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                else:
+                    result.append(' '.join(entity_tokens))
+                entity_tokens = []
+
+            current_entity = tag[2:]  # Remove B- prefix
+            entity_tokens.append(token)
+        elif tag.startswith('I-') and current_entity and tag[2:] == current_entity:  # Inside entity
+            entity_tokens.append(token)
+        else:  # Outside any entity (O tag)
+            if current_entity:  # Close previous entity
+                entity_type = entity_type_map.get(current_entity, '')
+                if entity_type:
+                    result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                else:
+                    result.append(' '.join(entity_tokens))
+                current_entity = None
+                entity_tokens = []
+
+            result.append(token)
+
+    # Handle any remaining entity at end of sequence
+    if current_entity:
+        entity_type = entity_type_map.get(current_entity, '')
+        if entity_type:
+            result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+        else:
+            result.append(' '.join(entity_tokens))
+
+    # Wrap the entire response in <response> tags as required by the pipeline
+    return f'<response>{" ".join(result)}</response>'
+
+
+def clean_prediction(text: str, tag_fix_patterns: List[Tuple[str, str]] = None) -> str:
+    """
+    Clean and fix common XML errors in model predictions.
+
+    Args:
+        text: The prediction text to clean
+        tag_fix_patterns: List of regex patterns and replacements to fix common XML errors
+
+    Returns:
+        Cleaned text with fixed XML tags
+    """
+    if tag_fix_patterns is None:
+        tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+    cleaned = text
+
+    # Extract content from response tags if present
+    response_match = re.search(r'<response>(.*?)</response>', cleaned, re.DOTALL)
+    if response_match:
+        cleaned = response_match.group(1)
+
+    # Apply fix patterns for common XML errors
+    for pattern, replacement in tag_fix_patterns:
+        cleaned = re.sub(pattern, replacement, cleaned)
+
+    return cleaned
+
+
+def extract_entities_from_text(text: str, reverse_entity_map: Dict[str, str]) -> List[Tuple]:
+    """
+    Extract entities from tagged text with robust error handling.
+
+    Args:
+        text: Text with XML entity tags
+        reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+    Returns:
+        List of (entity_type, entity_text, start_idx, end_idx) tuples
+    """
+    entities = []
+
+    # Define regex pattern to find XML-style entity tags - handle potential errors
+    pattern = r'<(\w+)>(.*?)</\1>'
+
+    try:
+        for match in re.finditer(pattern, text):
+            entity_type = match.group(1).lower()  # Normalize type to lowercase
+            entity_text = match.group(2)
+            start_idx = match.start()
+            end_idx = match.end()
+
+            # Map back to entity types if possible
+            mapped_type = reverse_entity_map.get(entity_type)
+
+            if mapped_type:
+                entities.append((mapped_type, entity_text, start_idx, end_idx))
+            else:
+                # Unknown entity type but still count it for evaluation
+                entities.append((entity_type, entity_text, start_idx, end_idx))
+
+    except Exception as e:
+        logger.warning(f'Error parsing entities in text: {str(e)}')
+
+    # Handle malformed XML by trying to find additional tag patterns
+    # This is a fallback for when the model produces incorrect tags
+    unclosed_pattern = r'<(\w+)>(.*?)(?=<|$)'
+    try:
+        # Find potential unclosed tags
+        for match in re.finditer(unclosed_pattern, text):
+            # Skip if already part of a well-formed tag
+            if any(start_idx <= match.start() < end_idx for _, _, start_idx, end_idx in entities):
+                continue
+
+            entity_type = match.group(1).lower()
+            entity_text = match.group(2)
+            start_idx = match.start()
+            end_idx = match.end()
+
+            # Map back to entity types
+            mapped_type = reverse_entity_map.get(entity_type)
+            if mapped_type:
+                entities.append((mapped_type, entity_text, start_idx, end_idx))
+
+    except Exception as e:
+        logger.warning(f'Error handling malformed tags: {str(e)}')
+
+    return entities
+
+
+def xml_to_bio_tags(xml_text: str, original_tokens: List[str], reverse_entity_map: Dict[str, str]) -> List[str]:
+    """
+    Convert XML-annotated text back to BIO tags aligned with the original tokens.
+
+    Args:
+        xml_text: Text with XML entity annotations
+        original_tokens: Original tokens to align with
+        reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+    Returns:
+        List of BIO tags corresponding to the original tokens
+    """
+    # Extract entities with their character positions
+    entities = extract_entities_from_text(xml_text, reverse_entity_map)
+
+    # Initialize all tags as 'O'
+    bio_tags = ['O'] * len(original_tokens)
+
+    # Reconstruct the original text to find character positions for each token
+    original_text = ' '.join(original_tokens)
+
+    # Track token start positions in the original text
+    token_positions = []
+    pos = 0
+    for token in original_tokens:
+        token_pos = original_text.find(token, pos)
+        if token_pos == -1:
+            # Fallback: just use the current position if we can't find the exact match
+            token_positions.append(pos)
+        else:
+            token_positions.append(token_pos)
+            pos = token_pos + len(token)
+
+    # Add token end positions
+    token_ends = [pos + len(token) for pos, token in zip(token_positions, original_tokens)]
+
+    # Map entities to tokens based on character positions
+    for entity_type, entity_text, start_pos, end_pos in entities:
+        # Extract the context from the XML text to help locate the correct entity occurrence
+        # Get some context before and after the entity in the XML text
+        context_start = max(0, start_pos - 20)
+        context_end = min(len(xml_text), end_pos + 20)
+
+        # Extract context without XML tags
+        context_before = re.sub(r'<[^>]+>', '', xml_text[context_start:start_pos])
+        context_after = re.sub(r'<[^>]+>', '', xml_text[end_pos:context_end])
+
+        # Use context to find the correct entity position in original text
+        search_pos = 0
+        entity_start = -1
+
+        while search_pos < len(original_text):
+            # Find the next occurrence of the entity
+            potential_start = original_text.find(entity_text, search_pos)
+            if potential_start == -1:
+                break
+
+            # Check if the context matches
+            potential_context_start = max(0, potential_start - len(context_before))
+            potential_context_end = min(len(original_text), potential_start + len(entity_text) + len(context_after))
+
+            before_match = context_before.strip() in original_text[potential_context_start:potential_start].strip()
+            after_match = context_after.strip() in original_text[potential_start
+                                                                 + len(entity_text):potential_context_end].strip()
+
+            # If context matches or we can't find a better match, use this position
+            if before_match or after_match or search_pos > len(original_text) // 2:
+                entity_start = potential_start
+                break
+
+            # Move search position forward
+            search_pos = potential_start + 1
+
+        # If we couldn't find the entity with context, fall back to the first occurrence
+        if entity_start == -1:
+            entity_start = original_text.find(entity_text)
+            if entity_start == -1:
+                continue
+
+        entity_end = entity_start + len(entity_text)
+
+        # Find tokens that overlap with this entity
+        for i, (token_start, token_end) in enumerate(zip(token_positions, token_ends)):
+            if token_start <= entity_end and token_end >= entity_start:
+                # This token overlaps with the entity
+                if bio_tags[i] == 'O':
+                    # Start of entity
+                    if i == 0 or bio_tags[i - 1] == 'O' or not bio_tags[i - 1].endswith(entity_type):
+                        bio_tags[i] = f'B-{entity_type}'
+                    else:
+                        # Continuation of entity
+                        bio_tags[i] = f'I-{entity_type}'
+
+    return bio_tags
+
+
+def calculate_bio_metrics(pred_tags: List[str], gold_tags: List[str], tokens: List[str]) -> Tuple[int, int, int]:
+    """
+    Calculate metrics by comparing BIO tag sequences.
+
+    Args:
+        pred_tags: Predicted BIO tags
+        gold_tags: Gold standard BIO tags
+        tokens: Original tokens
+
+    Returns:
+        Tuple of (true_positives, false_positives, false_negatives)
+    """
+    # Extract entity spans from BIO tags
+    pred_spans = extract_spans_from_bio(pred_tags, tokens)
+    gold_spans = extract_spans_from_bio(gold_tags, tokens)
+
+    # Calculate metrics
+    true_positives = len(pred_spans.intersection(gold_spans))
+    false_positives = len(pred_spans - gold_spans)
+    false_negatives = len(gold_spans - pred_spans)
+
+    return true_positives, false_positives, false_negatives
+
+
+def extract_spans_from_bio(tags: List[str], tokens: List[str]) -> Set[Tuple]:
+    """
+    Extract entity spans from BIO tags.
+
+    Args:
+        tags: List of BIO tags
+        tokens: List of tokens corresponding to the tags
+
+    Returns:
+        Set of (entity_type, start_idx, end_idx, text) tuples
+    """
+    spans = set()
+    current_entity = None
+    start_idx = None
+    entity_tokens = []
+
+    for i, (token, tag) in enumerate(zip(tokens, tags)):
+        if tag.startswith('B-'):  # Beginning of entity
+            # Close previous entity if exists
+            if current_entity:
+                entity_type = current_entity
+                entity_text = ' '.join(entity_tokens)
+                spans.add((entity_type, start_idx, i - 1, entity_text))
+                entity_tokens = []
+
+            current_entity = tag[2:]  # Remove B- prefix
+            start_idx = i
+            entity_tokens.append(token)
+        elif tag.startswith('I-') and current_entity:  # Inside entity
+            entity_tokens.append(token)
+        elif tag == 'O':  # Outside any entity
+            if current_entity:  # Close previous entity
+                entity_type = current_entity
+                entity_text = ' '.join(entity_tokens)
+                spans.add((entity_type, start_idx, i - 1, entity_text))
+                current_entity = None
+                start_idx = None
+                entity_tokens = []
+
+    # Handle any remaining entity at end of sequence
+    if current_entity:
+        entity_type = current_entity
+        entity_text = ' '.join(entity_tokens)
+        spans.add((entity_type, start_idx, len(tokens) - 1, entity_text))
+
+    return spans
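End to end, the new NER pipeline renders gold BIO tags into the XML target with `create_target_text`, normalizes the model's reply with `clean_prediction`, realigns it to the gold tokens with `xml_to_bio_tags`, and scores span overlap with `calculate_bio_metrics`. A hedged round-trip sketch (the sentence and entity maps are invented for illustration; the import path assumes the module location above):

```python
from evalscope.utils.ner import (calculate_bio_metrics, clean_prediction,
                                 create_target_text, xml_to_bio_tags)

tokens = ['John', 'Smith', 'works', 'at', 'Google', '.']
gold = ['B-PER', 'I-PER', 'O', 'O', 'B-ORG', 'O']
entity_type_map = {'PER': 'person', 'ORG': 'organization'}
reverse_entity_map = {'person': 'PER', 'organization': 'ORG'}

# Gold target the model is asked to reproduce:
# <response><person>John Smith</person> works at <organization>Google</organization> .</response>
target = create_target_text(tokens, gold, entity_type_map)

# Suppose the model tags the person but misses the organization.
prediction = '<response><person>John Smith</person> works at Google .</response>'
pred_tags = xml_to_bio_tags(clean_prediction(prediction), tokens, reverse_entity_map)
# ['B-PER', 'I-PER', 'O', 'O', 'O', 'O']

tp, fp, fn = calculate_bio_metrics(pred_tags, gold, tokens)
# tp=1 (John Smith), fp=0, fn=1 (Google) -> precision 1.0, recall 0.5
```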
evalscope/version.py
CHANGED