evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/utils/logger.py CHANGED
@@ -28,22 +28,41 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
  logging.getLogger('httpx').setLevel(logging.WARNING)
  logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+ info_set = set()
+ warning_set = set()
+
+
+ def info_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in info_set:
+         return
+     info_set.add(hash_id)
+     self.info(msg)
+
+
+ def warning_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in warning_set:
+         return
+     warning_set.add(hash_id)
+     self.warning(msg)
+
 
  def get_logger(
      log_file: Optional[str] = None,
      name: Optional[str] = None,
      log_level: int = DEFAULT_LEVEL,
      file_mode: str = 'w',
-     force=False
+     force: bool = False,
  ):
      """Get logging logger
 
      Args:
-         log_file: Log filename, if specified, file handler will be added to
-             logger
-         log_level: Logging level.
-         file_mode: Specifies the mode to open the file, if filename is
-             specified (if filemode is unspecified, it defaults to 'w').
+         log_file: Log filename. If specified, a file handler will be added to the logger.
+         name: Logical component name. Used to derive the logger name.
+         log_level: Logging level to set.
+         file_mode: Mode to open the file when log_file is provided (default 'w').
+         force: If True, reconfigure the existing logger (levels, formatters, handlers).
      """
 
      if name:
@@ -58,7 +77,7 @@ def get_logger(
          logger.setLevel(log_level)
          for handler in logger.handlers:
              handler.setLevel(log_level)
-             # 区分不同类型的 handler,使用相应的格式化器
+             # Select formatter by handler type
              if isinstance(handler, logging.FileHandler):
                  handler.setFormatter(
                      plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
@@ -67,6 +86,7 @@ def get_logger(
                  handler.setFormatter(
                      color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
                  )
+         # Ensure file handler points to current log_file (replace if needed)
          add_file_handler_if_needed(logger, log_file, file_mode, log_level)
          return logger
 
@@ -88,7 +108,7 @@ def get_logger(
      handlers = [stream_handler]
 
      if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
+         file_handler = logging.FileHandler(log_file, file_mode, encoding='utf-8')
          handlers.append(file_handler)
 
      for handler in handlers:
@@ -118,23 +138,54 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
          get_logger(log_level=logging.DEBUG, force=True)
 
 
- def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
-     for handler in logger.handlers:
-         if isinstance(handler, logging.FileHandler):
-             return
+ def add_file_handler_if_needed(
+     logger: logging.Logger,
+     log_file: Optional[str],
+     file_mode: str,
+     log_level: int,
+ ) -> None:
+     """Ensure logger has a FileHandler targeting log_file.
+     - If no FileHandler exists, add one.
+     - If a FileHandler exists but points to a different file, replace it.
+     """
+     if log_file is None:
+         return
 
+     # Only worker-0 writes files
      if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_master
-
          is_worker0 = is_master()
      else:
          is_worker0 = True
 
-     if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
-         file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
-         file_handler.setLevel(log_level)
-         logger.addHandler(file_handler)
+     if not is_worker0:
+         return
+
+     target_path = os.path.abspath(log_file)
+     existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+     # If there is a FileHandler already pointing to the target file, nothing to do.
+     for fh in existing_file_handlers:
+         try:
+             if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                 return
+         except Exception:
+             # If any issue retrieving baseFilename, fall through to replacement
+             pass
+
+     # Replace all existing FileHandlers with the new one
+     for fh in existing_file_handlers:
+         try:
+             logger.removeHandler(fh)
+             fh.flush()
+             fh.close()
+         except Exception:
+             pass
+
+     file_handler = logging.FileHandler(target_path, file_mode, encoding='utf-8')
+     file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)
 
 
  def warn_once(logger: Logger, message: str) -> None:
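
Taken together, the once-only helpers and the file-handler replacement above change what repeated get_logger calls do. A minimal usage sketch (not part of the diff; it calls warning_once as the free function defined above, since its binding onto Logger is not shown in this hunk):

# Sketch of the 1.2.0 logger behaviour shown in the hunks above (illustrative only).
from evalscope.utils.logger import get_logger, warning_once

logger = get_logger(log_file='run_a.log')
logger.info('goes to run_a.log')

# Re-requesting the logger with a different file now swaps the FileHandler to the new
# target; previously add_file_handler_if_needed returned early if any FileHandler
# existed, so run_b.log would have been ignored.
logger = get_logger(log_file='run_b.log')
logger.info('goes to run_b.log')

# De-duplicated warnings: the second call is a no-op because the hash_id repeats.
# warning_once is defined as a free function taking `self`, so the logger is passed explicitly.
warning_once(logger, 'falling back to the default judge model', hash_id='judge-fallback')
warning_once(logger, 'falling back to the default judge model', hash_id='judge-fallback')
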
evalscope/utils/model_utils.py CHANGED
@@ -3,6 +3,8 @@ import random
  from enum import Enum
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
+ from evalscope.utils.import_utils import check_import
+
  if TYPE_CHECKING:
      from transformers import GenerationConfig
 
@@ -67,7 +69,8 @@ def seed_everything(seed: int):
      """
      random.seed(seed)
      np.random.seed(seed)
-     try:
+
+     if check_import('torch', raise_warning=False):
          import torch
 
          torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
          torch.cuda.manual_seed_all(seed)
          torch.backends.cudnn.deterministic = True
          torch.backends.cudnn.benchmark = False
-     except ImportError:
-         pass
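
seed_everything now gates the torch-specific seeding on check_import instead of a broad try/except ImportError. A small sketch of the same pattern (illustrative only; only the check_import('torch', raise_warning=False) call is taken from the diff, the helper name is invented):

# Sketch of the check_import gating used above (not part of the diff).
from evalscope.utils.import_utils import check_import


def seed_torch_if_available(seed: int) -> None:
    # raise_warning=False keeps the call silent when torch is not installed.
    if check_import('torch', raise_warning=False):
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
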
evalscope/utils/multi_choices.py CHANGED
@@ -1,11 +1,8 @@
  # flake8: noqa: E501
- from __future__ import annotations
-
  import re
- from typing import TYPE_CHECKING, List, Optional
+ from typing import List, Optional, Union
 
- if TYPE_CHECKING:
-     from evalscope.api.evaluator import Choices, Target, TaskState
+ from evalscope.api.evaluator import Choices, Target, TaskState
 
  FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:
 
@@ -84,10 +81,27 @@ def answer_options(choices: Choices) -> str:
      return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
 
 
- def prompt(question: str, choices: Choices, template: str, fewshot: Optional[str] = None) -> str:
+ def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
+     """
+     Returns the `choices` formatted as a letter list, e.g.:
+
+     ["choice 1", "choice 2", "choice 3"] ->
+     "A,B,C"
+     """
+     if isinstance(choices, list):
+         choices = Choices(choices)
+
+     indexes = list(range(len(choices)))
+
+     return ','.join([f'{answer_character(i)}' for i in indexes])
+
+
+ def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
+     if isinstance(choices, list):
+         choices = Choices(choices)
 
      choices_text = answer_options(choices)
-     letters = ','.join(answer_character(i) for i in range(len(choices)))
+     letters = format_letter_choices(choices)
      if not fewshot:
          return template.format(
              choices=choices_text,
@@ -122,6 +136,14 @@ def format_example(
      return f'{question}\n{choices_text}\nANSWER: {answer.text}'
 
 
+ def _fallback_parse_answer(completion: str) -> Optional[set[str]]:
+     # Fallback to find the last upper case letter
+     for letter in reversed(completion):
+         if letter.isupper():
+             return {letter}
+     return None
+
+
  def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
      """
      Convenience function for extracting answers from the state output.
@@ -150,6 +172,11 @@ def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
          state.output.completion,
      )
 
+     if match is None:
+         fallback_answer = _fallback_parse_answer(state.output.completion)
+         if fallback_answer:
+             return fallback_answer
+
      if match is None:
          return set()
 
@@ -200,6 +227,11 @@ def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[st
      pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
      match = re.search(pattern, state.output.completion, flags=re.MULTILINE)
 
+     if match is None:
+         fallback_answer = _fallback_parse_answer(state.output.completion)
+         if fallback_answer:
+             return fallback_answer
+
      if match is None:
          return set()
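
With these changes, format_letter_choices and prompt accept plain Python lists as well as Choices. A short sketch (not part of the diff; the template string and its {question}/{letters} placeholders are assumptions, only the {choices} key is visible in the hunk above):

# Sketch of the list-friendly multiple-choice helpers (illustrative only).
from evalscope.utils.multi_choices import format_letter_choices, prompt

letters = format_letter_choices(['yes', 'no', 'maybe'])  # -> 'A,B,C'

text = prompt(
    question='Is the sky blue?',
    choices=['yes', 'no', 'maybe'],
    # Hypothetical template: assumes the {question}/{choices}/{letters} placeholders
    # that the helper fills in; the real benchmark templates live elsewhere in the package.
    template='{question}\n{choices}\nAnswer with one of: {letters}',
)
print(text)
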
 
evalscope/utils/ner.py ADDED
@@ -0,0 +1,377 @@
+ import re
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT_TEMPLATE = """
+ You are a named entity recognition system that identifies the following entity types:
+ {entities}
+
+ Process the provided text and mark all named entities with XML-style tags.
+
+ For example:
+ <person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+ Available entity tags: {entity_list}
+
+ INSTRUCTIONS:
+ 1. Wrap your entire response in <response>...</response> tags.
+ 2. Inside these tags, include the original text with entity tags inserted.
+ 3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+ 4. Tag ALL entities you can identify using the exact tag names provided.
+ 5. Do not include explanations, just the tagged text.
+ 6. If entity spans overlap, choose the most specific entity type.
+ 7. Ensure every opening tag has a matching closing tag.
+
+ Text to process:
+ {text}
+ """.lstrip()
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of named entity recognition:
+
+ {fewshot}
+
+ You are a named entity recognition system that identifies the following entity types:
+ {entities}
+
+ Process the provided text and mark all named entities with XML-style tags.
+
+ For example:
+ <person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+ Available entity tags: {entity_list}
+
+ INSTRUCTIONS:
+ 1. Wrap your entire response in <response>...</response> tags.
+ 2. Inside these tags, include the original text with entity tags inserted.
+ 3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+ 4. Tag ALL entities you can identify using the exact tag names provided.
+ 5. Do not include explanations, just the tagged text.
+ 6. If entity spans overlap, choose the most specific entity type.
+ 7. Ensure every opening tag has a matching closing tag.
+
+ Text to process:
+ {text}
+ """.lstrip()
+
+ # Common error patterns to handle in XML predictions
+ DEFAULT_TAG_FIX_PATTERNS = [
+     # Fix mismatched tags
+     (r'<(\w+)>(.*?)</\w+>', r'<\1>\2</\1>'),
+ ]
+
+
+ def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
+     """
+     Create annotated text from tokens and NER tags.
+     Handles BIO tagging scheme conversion to inline XML-style tags.
+
+     Args:
+         tokens: List of text tokens
+         ner_tags: List of BIO tags corresponding to tokens
+         entity_type_map: Mapping from BIO entity types to user-friendly tag names
+
+     Returns:
+         String with XML-style entity markup wrapped in <response> tags
+     """
+     result = []
+     current_entity = None
+     entity_tokens = []
+
+     for i, (token, tag) in enumerate(zip(tokens, ner_tags)):
+         if tag.startswith('B-'):  # Beginning of entity
+             # Close previous entity if exists
+             if current_entity:
+                 entity_type = entity_type_map.get(current_entity, '')
+                 if entity_type:
+                     result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                 else:
+                     result.append(' '.join(entity_tokens))
+                 entity_tokens = []
+
+             current_entity = tag[2:]  # Remove B- prefix
+             entity_tokens.append(token)
+         elif tag.startswith('I-') and current_entity and tag[2:] == current_entity:  # Inside entity
+             entity_tokens.append(token)
+         else:  # Outside any entity (O tag)
+             if current_entity:  # Close previous entity
+                 entity_type = entity_type_map.get(current_entity, '')
+                 if entity_type:
+                     result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                 else:
+                     result.append(' '.join(entity_tokens))
+                 current_entity = None
+                 entity_tokens = []
+
+             result.append(token)
+
+     # Handle any remaining entity at end of sequence
+     if current_entity:
+         entity_type = entity_type_map.get(current_entity, '')
+         if entity_type:
+             result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+         else:
+             result.append(' '.join(entity_tokens))
+
+     # Wrap the entire response in <response> tags as required by the pipeline
+     return f'<response>{" ".join(result)}</response>'
+
+
+ def clean_prediction(text: str, tag_fix_patterns: List[Tuple[str, str]] = None) -> str:
+     """
+     Clean and fix common XML errors in model predictions.
+
+     Args:
+         text: The prediction text to clean
+         tag_fix_patterns: List of regex patterns and replacements to fix common XML errors
+
+     Returns:
+         Cleaned text with fixed XML tags
+     """
+     if tag_fix_patterns is None:
+         tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+     cleaned = text
+
+     # Extract content from response tags if present
+     response_match = re.search(r'<response>(.*?)</response>', cleaned, re.DOTALL)
+     if response_match:
+         cleaned = response_match.group(1)
+
+     # Apply fix patterns for common XML errors
+     for pattern, replacement in tag_fix_patterns:
+         cleaned = re.sub(pattern, replacement, cleaned)
+
+     return cleaned
+
+
+ def extract_entities_from_text(text: str, reverse_entity_map: Dict[str, str]) -> List[Tuple]:
+     """
+     Extract entities from tagged text with robust error handling.
+
+     Args:
+         text: Text with XML entity tags
+         reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+     Returns:
+         List of (entity_type, entity_text, start_idx, end_idx) tuples
+     """
+     entities = []
+
+     # Define regex pattern to find XML-style entity tags - handle potential errors
+     pattern = r'<(\w+)>(.*?)</\1>'
+
+     try:
+         for match in re.finditer(pattern, text):
+             entity_type = match.group(1).lower()  # Normalize type to lowercase
+             entity_text = match.group(2)
+             start_idx = match.start()
+             end_idx = match.end()
+
+             # Map back to entity types if possible
+             mapped_type = reverse_entity_map.get(entity_type)
+
+             if mapped_type:
+                 entities.append((mapped_type, entity_text, start_idx, end_idx))
+             else:
+                 # Unknown entity type but still count it for evaluation
+                 entities.append((entity_type, entity_text, start_idx, end_idx))
+
+     except Exception as e:
+         logger.warning(f'Error parsing entities in text: {str(e)}')
+
+     # Handle malformed XML by trying to find additional tag patterns
+     # This is a fallback for when the model produces incorrect tags
+     unclosed_pattern = r'<(\w+)>(.*?)(?=<|$)'
+     try:
+         # Find potential unclosed tags
+         for match in re.finditer(unclosed_pattern, text):
+             # Skip if already part of a well-formed tag
+             if any(start_idx <= match.start() < end_idx for _, _, start_idx, end_idx in entities):
+                 continue
+
+             entity_type = match.group(1).lower()
+             entity_text = match.group(2)
+             start_idx = match.start()
+             end_idx = match.end()
+
+             # Map back to entity types
+             mapped_type = reverse_entity_map.get(entity_type)
+             if mapped_type:
+                 entities.append((mapped_type, entity_text, start_idx, end_idx))
+
+     except Exception as e:
+         logger.warning(f'Error handling malformed tags: {str(e)}')
+
+     return entities
+
+
+ def xml_to_bio_tags(xml_text: str, original_tokens: List[str], reverse_entity_map: Dict[str, str]) -> List[str]:
+     """
+     Convert XML-annotated text back to BIO tags aligned with the original tokens.
+
+     Args:
+         xml_text: Text with XML entity annotations
+         original_tokens: Original tokens to align with
+         reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+     Returns:
+         List of BIO tags corresponding to the original tokens
+     """
+     # Extract entities with their character positions
+     entities = extract_entities_from_text(xml_text, reverse_entity_map)
+
+     # Initialize all tags as 'O'
+     bio_tags = ['O'] * len(original_tokens)
+
+     # Reconstruct the original text to find character positions for each token
+     original_text = ' '.join(original_tokens)
+
+     # Track token start positions in the original text
+     token_positions = []
+     pos = 0
+     for token in original_tokens:
+         token_pos = original_text.find(token, pos)
+         if token_pos == -1:
+             # Fallback: just use the current position if we can't find the exact match
+             token_positions.append(pos)
+         else:
+             token_positions.append(token_pos)
+             pos = token_pos + len(token)
+
+     # Add token end positions
+     token_ends = [pos + len(token) for pos, token in zip(token_positions, original_tokens)]
+
+     # Map entities to tokens based on character positions
+     for entity_type, entity_text, start_pos, end_pos in entities:
+         # Extract the context from the XML text to help locate the correct entity occurrence
+         # Get some context before and after the entity in the XML text
+         context_start = max(0, start_pos - 20)
+         context_end = min(len(xml_text), end_pos + 20)
+
+         # Extract context without XML tags
+         context_before = re.sub(r'<[^>]+>', '', xml_text[context_start:start_pos])
+         context_after = re.sub(r'<[^>]+>', '', xml_text[end_pos:context_end])
+
+         # Use context to find the correct entity position in original text
+         search_pos = 0
+         entity_start = -1
+
+         while search_pos < len(original_text):
+             # Find the next occurrence of the entity
+             potential_start = original_text.find(entity_text, search_pos)
+             if potential_start == -1:
+                 break
+
+             # Check if the context matches
+             potential_context_start = max(0, potential_start - len(context_before))
+             potential_context_end = min(len(original_text), potential_start + len(entity_text) + len(context_after))
+
+             before_match = context_before.strip() in original_text[potential_context_start:potential_start].strip()
+             after_match = context_after.strip() in original_text[potential_start
+                                                                  + len(entity_text):potential_context_end].strip()
+
+             # If context matches or we can't find a better match, use this position
+             if before_match or after_match or search_pos > len(original_text) // 2:
+                 entity_start = potential_start
+                 break
+
+             # Move search position forward
+             search_pos = potential_start + 1
+
+         # If we couldn't find the entity with context, fall back to the first occurrence
+         if entity_start == -1:
+             entity_start = original_text.find(entity_text)
+             if entity_start == -1:
+                 continue
+
+         entity_end = entity_start + len(entity_text)
+
+         # Find tokens that overlap with this entity
+         for i, (token_start, token_end) in enumerate(zip(token_positions, token_ends)):
+             if token_start <= entity_end and token_end >= entity_start:
+                 # This token overlaps with the entity
+                 if bio_tags[i] == 'O':
+                     # Start of entity
+                     if i == 0 or bio_tags[i - 1] == 'O' or not bio_tags[i - 1].endswith(entity_type):
+                         bio_tags[i] = f'B-{entity_type}'
+                     else:
+                         # Continuation of entity
+                         bio_tags[i] = f'I-{entity_type}'
+
+     return bio_tags
+
+
+ def calculate_bio_metrics(pred_tags: List[str], gold_tags: List[str], tokens: List[str]) -> Tuple[int, int, int]:
+     """
+     Calculate metrics by comparing BIO tag sequences.
+
+     Args:
+         pred_tags: Predicted BIO tags
+         gold_tags: Gold standard BIO tags
+         tokens: Original tokens
+
+     Returns:
+         Tuple of (true_positives, false_positives, false_negatives)
+     """
+     # Extract entity spans from BIO tags
+     pred_spans = extract_spans_from_bio(pred_tags, tokens)
+     gold_spans = extract_spans_from_bio(gold_tags, tokens)
+
+     # Calculate metrics
+     true_positives = len(pred_spans.intersection(gold_spans))
+     false_positives = len(pred_spans - gold_spans)
+     false_negatives = len(gold_spans - pred_spans)
+
+     return true_positives, false_positives, false_negatives
+
+
+ def extract_spans_from_bio(tags: List[str], tokens: List[str]) -> Set[Tuple]:
+     """
+     Extract entity spans from BIO tags.
+
+     Args:
+         tags: List of BIO tags
+         tokens: List of tokens corresponding to the tags
+
+     Returns:
+         Set of (entity_type, start_idx, end_idx, text) tuples
+     """
+     spans = set()
+     current_entity = None
+     start_idx = None
+     entity_tokens = []
+
+     for i, (token, tag) in enumerate(zip(tokens, tags)):
+         if tag.startswith('B-'):  # Beginning of entity
+             # Close previous entity if exists
+             if current_entity:
+                 entity_type = current_entity
+                 entity_text = ' '.join(entity_tokens)
+                 spans.add((entity_type, start_idx, i - 1, entity_text))
+                 entity_tokens = []
+
+             current_entity = tag[2:]  # Remove B- prefix
+             start_idx = i
+             entity_tokens.append(token)
+         elif tag.startswith('I-') and current_entity:  # Inside entity
+             entity_tokens.append(token)
+         elif tag == 'O':  # Outside any entity
+             if current_entity:  # Close previous entity
+                 entity_type = current_entity
+                 entity_text = ' '.join(entity_tokens)
+                 spans.add((entity_type, start_idx, i - 1, entity_text))
+                 current_entity = None
+                 start_idx = None
+                 entity_tokens = []
+
+     # Handle any remaining entity at end of sequence
+     if current_entity:
+         entity_type = current_entity
+         entity_text = ' '.join(entity_tokens)
+         spans.add((entity_type, start_idx, len(tokens) - 1, entity_text))
+
+     return spans
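
A small end-to-end sketch of the new NER helpers above (not part of the diff; the tokens and entity maps are invented for illustration):

# Round-trip the helpers from evalscope/utils/ner.py on a toy sentence (illustrative only).
from evalscope.utils.ner import (calculate_bio_metrics, clean_prediction, create_target_text, xml_to_bio_tags)

tokens = ['John', 'Smith', 'works', 'at', 'Google', '.']
gold_tags = ['B-PER', 'I-PER', 'O', 'O', 'B-ORG', 'O']
entity_type_map = {'PER': 'person', 'ORG': 'organization'}     # BIO type -> tag name
reverse_entity_map = {'person': 'PER', 'organization': 'ORG'}  # tag name -> BIO type

# Reference string the model is asked to reproduce:
# '<response><person>John Smith</person> works at <organization>Google</organization> .</response>'
target = create_target_text(tokens, gold_tags, entity_type_map)

# Treat the reference as a model completion: strip/repair the XML, then map back to BIO tags.
pred_tags = xml_to_bio_tags(clean_prediction(target), tokens, reverse_entity_map)

tp, fp, fn = calculate_bio_metrics(pred_tags, gold_tags, tokens)
print(tp, fp, fn)  # 2 0 0 for this toy round trip
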
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '1.0.0'
- __release_datetime__ = '2025-08-25 12:00:00'
+ __version__ = '1.2.0'
+ __release_datetime__ = '2025-11-11 12:00:00'