evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py (new file)
@@ -0,0 +1,433 @@
+ # flake8: noqa
+ import ast
+ import os
+ import re
+
+ from .IoUscore_metric import calculate_iou, extract_coordinates, vqa_with_position_evaluation
+ from .page_ocr_metric import cal_per_metrics
+ from .spotting_metric import extract_bounding_boxes_robust, spotting_evaluation
+ from .TEDS_metric import (
+     TEDS,
+     compute_f1_score,
+     convert_markdown_table_to_html,
+     convert_str_to_dict,
+     convert_str_to_multi_dict,
+     dict_to_html,
+     doc_parsing_evaluation,
+     generate_combinations,
+     wrap_html_table,
+ )
+ from .vqa_metric import (
+     cn_math_expression_evaluation,
+     cn_vqa_evaluation,
+     counting_evaluation,
+     math_expression_evaluation,
+     vqa_evaluation,
+     vqa_evaluation_case_sensitive,
+ )
+
+ teds = TEDS(n_jobs=os.cpu_count() or 1)
+
+
+ def is_nan_value(value):
+     if value is None:
+         return True
+     if isinstance(value, str) and value.lower() == 'nan':
+         return True
+     try:
+         import pandas as pd
+
+         if pd.isna(value):
+             return True
+     except:
+         pass
+     return False
+
+
+ def get_value_or_zero(value):
+     return 0.0 if value is None else value
+
+
+ def ocrbench_v2_process_results(doc, pred):
+     question = doc['question']
+     gt_ans = doc['answers']
+     data_type = doc['type']
+
+     score = 0
+
+     if (
+         data_type == 'APP agent en' or data_type == 'ASCII art classification en' or data_type == 'math QA en'
+         or data_type == 'reasoning VQA en' or data_type == 'science QA en' or data_type == 'text recognition en'
+         or data_type == 'document classification en' or data_type == 'cognition VQA en' or data_type == 'diagram QA en'
+     ):
+         if doc['eval'] == 'multiple choice':
+             if not isinstance(gt_ans, list):
+                 gt_ans = [gt_ans]
+             assert len(gt_ans) == 1
+
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 predict = ''.join(c for c in pred if c.isalpha())
+
+                 if predict == gt_ans[0]:
+                     score = 1
+                 else:
+                     score = 0
+         elif doc['eval'] == 'case sensitive':
+             score = vqa_evaluation_case_sensitive(pred, gt_ans)
+
+         else:
+             score = vqa_evaluation(pred, gt_ans)
+
+     elif data_type == 'cognition VQA cn' or data_type == 'reasoning VQA cn':
+         if doc['eval'] == 'multiple choice':
+             assert len(gt_ans) == 1
+             predict = ''.join(c for c in pred if c.isalpha())
+
+             if predict == gt_ans[0]:
+                 score = 1
+             else:
+                 score = 0
+         elif doc['eval'] == 'case sensitive':
+             score = vqa_evaluation_case_sensitive(pred, gt_ans)
+
+         else:
+             score = cn_vqa_evaluation(pred, gt_ans)
+
+     elif data_type == 'handwritten answer extraction cn':
+         if '简答' in question:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+         else:
+             assert len(gt_ans) == 1
+             answer = gt_ans[0]
+             chars = list(answer)
+             if len(answer) > 1:
+                 answer_list = [
+                     ''.join(chars), '.'.join(chars), '. '.join(chars), ','.join(chars), ', '.join(chars),
+                     '、'.join(chars), ';'.join(chars), '; '.join(chars), ' '.join(chars), '和'.join(chars)
+                 ]
+                 max_score = 0
+                 for answer in answer_list:
+                     if answer in pred:
+                         temp_score = 1
+                     else:
+                         temp_score = 0
+                     if temp_score > max_score:
+                         max_score = temp_score
+                 score = max_score
+
+             else:
+                 if gt_ans[0] in pred:
+                     score = 1
+                 else:
+                     score = 0
+
+     elif data_type == 'formula recognition cn':
+         if is_nan_value(pred):
+             score = 0
+         else:
+             score = cn_math_expression_evaluation(pred, gt_ans)
+
+     elif data_type == 'text counting en':
+         score = counting_evaluation(pred, gt_ans, doc['eval'])
+
+     elif data_type == 'formula recognition en':
+         score = math_expression_evaluation(pred, gt_ans)
+
+     elif data_type == 'table parsing en':
+         if type(gt_ans) == list and len(gt_ans) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+
+             elif 'html' in question.lower():
+                 no_find = False
+                 predict_table = pred.replace('\n', '')
+                 if '<body' in predict_table:
+                     predict_table = re.findall('<body.*', predict_table)[0]
+                 elif '<table' in predict_table:
+                     predict_table = re.findall('<table.*', predict_table)[0]
+                 else:
+                     no_find = True
+
+                 if no_find:
+                     score = 0
+                 else:
+                     pred_table_html = wrap_html_table(predict_table)
+                     gold_table_html = wrap_html_table(gt_ans[0])
+                     try:
+                         score = teds.evaluate(pred_table_html, gold_table_html)
+                     except:
+                         score = 0
+
+             elif 'markdown' in question.lower():
+                 if not isinstance(pred, str):
+                     prediction = str(pred)
+                     pred_table_html = convert_markdown_table_to_html(prediction)
+                     gt_table_html = convert_markdown_table_to_html(gt_ans[0])
+                     score = teds.evaluate(pred_table_html, gt_table_html)
+
+                 else:
+                     pred_table_html = convert_markdown_table_to_html(pred)
+                     gt_table_html = convert_markdown_table_to_html(gt_ans[0])
+                     score = teds.evaluate(pred_table_html, gt_table_html)
+         else:
+             raise ValueError
+
+     elif data_type == 'table parsing cn':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             no_find = False
+             predict_table = pred.replace('\n', '')
+             if '<body' in predict_table:
+                 predict_table = re.findall('<body.*', predict_table)[0]
+             elif '<table' in predict_table:
+                 predict_table = re.findall('<table.*', predict_table)[0]
+             else:
+                 no_find = True
+
+             if no_find:
+                 score = 0
+             else:
+                 pred_table_html = wrap_html_table(predict_table)
+                 gold_table_html = wrap_html_table(gt_ans[0])
+                 try:
+                     score = teds.evaluate(pred_table_html, gold_table_html)
+                 except:
+                     score = 0
+                     print('error')
+
+     elif data_type == 'chart parsing en':
+         answer = gt_ans[0]
+         if pred:
+             pred_chart_dict = convert_str_to_multi_dict(pred)
+             if len(pred_chart_dict) == 0:
+                 score = 0
+             else:
+                 pred_chart_html = dict_to_html(pred_chart_dict)
+                 if isinstance(answer, str):
+                     answer = convert_str_to_multi_dict(pred)
+                 gt_chart_html = dict_to_html(answer)
+                 score = teds.evaluate(pred_chart_html, gt_chart_html)
+         else:
+             score = 0
+
+     elif data_type == 'document parsing en':
+         assert type(gt_ans) == list and len(gt_ans) == 1
+         score = doc_parsing_evaluation(pred, gt_ans[0])
+
+     elif data_type == 'document parsing cn':
+         assert type(gt_ans) == list and len(gt_ans) == 1
+         score = doc_parsing_evaluation(pred, gt_ans[0])
+
+     elif data_type == 'key information extraction en' or data_type == 'key information mapping en':
+         assert len(gt_ans) == 1
+         answers = generate_combinations(gt_ans[0])
+
+         if type(answers) == list and len(answers) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answers[0])
+         else:
+             max_score = 0
+             for answer in answers:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answer)
+
+                 if score > max_score:
+                     max_score = score
+             score = max_score
+
+     elif data_type == 'key information extraction cn':
+         assert len(gt_ans) == 1
+         answers = ast.literal_eval(gt_ans[0])
+         answers = {k: v if isinstance(v, list) else [v] for k, v in answers.items()}
+         answers = generate_combinations(answers)
+         if type(answers) == list and len(answers) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answers[0])
+         else:
+             max_score = 0
+             for answer in answers:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answer)
+
+                 if score > max_score:
+                     max_score = score
+             score = max_score
+
+     elif data_type == 'VQA with position en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             pred_dict = convert_str_to_dict(pred)
+             score = vqa_with_position_evaluation(pred_dict, doc)
+
+     elif data_type == 'text translation cn':
+         if len(pred) == 0:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 ocr_metric['bleu'] + ocr_metric['meteor'] + ocr_metric['f_measure'] + (1 - ocr_metric['edit_dist'])
+             ) / 4
+
+     elif data_type == 'fine-grained text recognition en':
+         if not isinstance(pred, str):
+             score = 0
+         elif len(pred) == 0:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+     elif data_type == 'full-page OCR en':
+         if not pred:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+
+     elif data_type == 'full-page OCR cn':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             if len(pred) == 0:
+                 score = 0
+             else:
+                 ocr_metric = cal_per_metrics(pred, gt_ans[0])
+                 score = (
+                     ocr_metric['bleu'] + ocr_metric['meteor'] + ocr_metric['f_measure'] + (1 - ocr_metric['edit_dist'])
+                 ) / 4
+
+     elif data_type == 'text grounding en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             predict_bbox = extract_coordinates(pred)
+             if not predict_bbox:
+                 score = 0
+             else:
+                 score = calculate_iou(predict_bbox, gt_ans)
+
+     elif data_type == 'text spotting en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             predict_bbox = extract_bounding_boxes_robust(pred)
+             if not predict_bbox:
+                 score = 0
+             else:
+                 score = spotting_evaluation(predict_bbox, doc)
+
+     return score
+
+
+ def calculate_average_score(categories, OCRBench_v2_score):
+     return sum(
+         sum(OCRBench_v2_score[cat]) / len(OCRBench_v2_score[cat]) if len(OCRBench_v2_score[cat]) > 0 else 0
+         for cat in categories
+     ) / len(categories)
+
+
+ def ocrbench_v2_aggregate_accuracy(results):
+     question_type_scores = {}
+     OCRBench_v2_score = {
+         'text_recognition_en': [],
+         'text_detection_en': [],
+         'text_spotting_en': [],
+         'relationship_extraction_en': [],
+         'element_parsing_en': [],
+         'mathematical_calculation_en': [],
+         'visual_text_understanding_en': [],
+         'knowledge_reasoning_en': [],
+         'text_recognition_cn': [],
+         'relationship_extraction_cn': [],
+         'element_parsing_cn': [],
+         'visual_text_understanding_cn': [],
+         'knowledge_reasoning_cn': [],
+     }
+
+     for result in results:
+
+         question_type = result['question_type']
+         score = result['score']
+
+         if question_type not in question_type_scores:
+             question_type_scores[question_type] = []
+         question_type_scores[question_type].append(score)
+
+         if question_type in ['text recognition en', 'fine-grained text recognition en', 'full-page OCR en']:
+             OCRBench_v2_score['text_recognition_en'].append(score)
+
+         elif question_type in ['text grounding en', 'VQA with position en']:
+             OCRBench_v2_score['text_detection_en'].append(score)
+
+         elif question_type == 'text spotting en':
+             OCRBench_v2_score['text_spotting_en'].append(score)
+
+         elif question_type in ['key information extraction en', 'key information mapping en']:
+             OCRBench_v2_score['relationship_extraction_en'].append(score)
+
+         elif question_type in ['document parsing en', 'chart parsing en', 'table parsing en', 'formula recognition en']:
+             OCRBench_v2_score['element_parsing_en'].append(score)
+
+         elif question_type in ['math QA en', 'text counting en']:
+             OCRBench_v2_score['mathematical_calculation_en'].append(score)
+
+         elif question_type in ['document classification en', 'cognition VQA en', 'diagram QA en']:
+             OCRBench_v2_score['visual_text_understanding_en'].append(score)
+
+         elif question_type in ['reasoning VQA en', 'science QA en', 'APP agent en', 'ASCII art classification en']:
+             OCRBench_v2_score['knowledge_reasoning_en'].append(score)
+
+         elif question_type == 'full-page OCR cn':
+             OCRBench_v2_score['text_recognition_cn'].append(score)
+
+         elif question_type in ['key information extraction cn', 'handwritten answer extraction cn']:
+             OCRBench_v2_score['relationship_extraction_cn'].append(score)
+
+         elif question_type in ['document parsing cn', 'table parsing cn', 'formula recognition cn']:
+             OCRBench_v2_score['element_parsing_cn'].append(score)
+
+         elif question_type == 'cognition VQA cn':
+             OCRBench_v2_score['visual_text_understanding_cn'].append(score)
+
+         elif question_type in ['reasoning VQA cn', 'text translation cn']:
+             OCRBench_v2_score['knowledge_reasoning_cn'].append(score)
+
+         else:
+             print('No such task!')
+             raise TypeError
+
+     english_tasks = [
+         'text_recognition_en', 'text_detection_en', 'text_spotting_en', 'relationship_extraction_en',
+         'element_parsing_en', 'mathematical_calculation_en', 'visual_text_understanding_en', 'knowledge_reasoning_en'
+     ]
+
+     chinese_tasks = [
+         'text_recognition_cn', 'relationship_extraction_cn', 'element_parsing_cn', 'visual_text_understanding_cn',
+         'knowledge_reasoning_cn'
+     ]
+
+     OCRBench_v2_English_subset_score = calculate_average_score(english_tasks, OCRBench_v2_score)
+     OCRBench_v2_Chinese_subset_score = calculate_average_score(chinese_tasks, OCRBench_v2_score)
+
+     Final_score = (OCRBench_v2_English_subset_score + OCRBench_v2_Chinese_subset_score) / 2
+
+     return Final_score  # return the final score as accuracy
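Taken together, ocrbench_v2_process_results scores a single sample according to its task type, and ocrbench_v2_aggregate_accuracy buckets the per-sample scores into English and Chinese task groups before averaging them. A minimal usage sketch (not part of the diff; the sample record is invented for illustration, and it assumes evalscope 1.2.0 plus the optional OCR-benchmark dependencies are installed so the module imports cleanly):

    from evalscope.benchmarks.ocr_bench.ocr_bench_v2.utils import (
        ocrbench_v2_aggregate_accuracy,
        ocrbench_v2_process_results,
    )

    # One sample in the shape the scorer expects: question text, list of
    # reference answers, task type, and the eval mode used by some branches.
    doc = {
        'question': 'What is the text in the image?',
        'answers': ['hello world'],
        'type': 'text recognition en',
        'eval': None,  # neither 'multiple choice' nor 'case sensitive', so this falls through to vqa_evaluation
    }
    score = ocrbench_v2_process_results(doc, 'hello world')  # -> 1

    # The aggregator expects one record per sample with the task name and its score.
    final = ocrbench_v2_aggregate_accuracy([{'question_type': doc['type'], 'score': score}])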
evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py (new file)
@@ -0,0 +1,254 @@
+ # flake8: noqa
+ import math
+ import re
+
+
+ def levenshtein_distance(s1, s2):
+     if len(s1) > len(s2):
+         s1, s2 = s2, s1
+
+     distances = range(len(s1) + 1)
+     for i2, c2 in enumerate(s2):
+         distances_ = [i2 + 1]
+         for i1, c1 in enumerate(s1):
+             if c1 == c2:
+                 distances_.append(distances[i1])
+             else:
+                 distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+         distances = distances_
+     return distances[-1]
+
+
+ def vqa_evaluation(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ')
+             if len(answer.split()) < 5:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ')
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+         if len(answer.split()) < 5:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def cn_vqa_evaluation(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).lower().strip().replace('\n', ' ').replace(' ', '')
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ').replace(' ', '')
+             if len(answer.split(',')) < 4:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ').replace(' ', '')
+         predict_str = str(predict).lower().strip().replace('\n', ' ').replace(' ', '')
+         if len(answer.split(',')) < 4:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def vqa_evaluation_case_sensitive(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).strip().replace('\n', ' ')
+         for ans in answers:
+             answer = str(ans).strip().replace('\n', ' ')
+             if len(answer.split()) < 5:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).strip().replace('\n', ' ')
+         predict_str = str(predict).strip().replace('\n', ' ')
+         if len(answer.split()) < 5:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def extract_first_number(string):
+     match = re.search(r'\d+', string)
+     if match:
+         return int(match.group())
+     return None
+
+
+ def counting_evaluation(predict, answers, eval_method):
+     score = 0
+
+     # normalize predict to string for both matching and number extraction
+     if isinstance(predict, str):
+         predict_str = predict.lower().strip().replace('\n', ' ')
+     elif isinstance(predict, (int, float)):
+         if isinstance(predict, float) and math.isnan(predict):
+             return 0
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+     else:
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+
+     if isinstance(answers, list):
+         temp_score = 0
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ')
+             if eval_method == 'exact match':
+                 score = 1 if answer in predict_str else 0
+             elif eval_method == 'regression':
+                 predict_number = extract_first_number(predict_str)
+                 if predict_number is not None:
+                     try:
+                         answer_int = int(answer)
+                     except ValueError:
+                         score = 0
+                     else:
+                         if predict_number <= 0 or predict_number >= 2 * answer_int:
+                             score = 0
+                         else:
+                             iou = 1 - abs(predict_number - answer_int) / answer_int
+                             score = iou if iou > 0.5 else 0
+                 else:
+                     score = 0
+             if score > temp_score:
+                 temp_score = score
+         score = temp_score
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ')
+         if eval_method == 'exact match':
+             score = 1 if answer in predict_str else 0
+         elif eval_method == 'regression':
+             predict_number = extract_first_number(predict_str)
+             if predict_number is not None:
+                 try:
+                     answer_int = int(answer)
+                 except ValueError:
+                     score = 0
+                 else:
+                     if predict_number <= 0 or predict_number >= 2 * answer_int:
+                         score = 0
+                     else:
+                         iou = 1 - abs(predict_number - answer_int) / answer_int
+                         score = iou if iou > 0.5 else 0
+             else:
+                 score = 0
+     return score
+
+
+ def math_expression_evaluation(predict, answers):
+     score = 0
+     if type(answers) == list:
+         for j in range(len(answers)):
+             answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
+             predict = predict.strip().replace('\n', ' ').replace(' ', '')
+             if answer in predict:
+                 score = 1
+     else:
+         answers = answers.strip().replace('\n', ' ').replace(' ', '')
+         predict = predict.strip().replace('\n', ' ').replace(' ', '')
+         if answers in predict:
+             score = 1
+     return score
+
+
+ def remove_text_tags(latex_str):
+     """
+     Removes LaTeX \text{...} tags while keeping their content.
+
+     :param latex_str: A string containing LaTeX expressions
+     :return: The processed string with \text{...} tags removed
+     """
+
+     pattern = r'\\text\{([^{}]*)\}'
+
+     processed_str = re.sub(pattern, r'\1', latex_str)
+
+     return processed_str
+
+
+ def cn_math_expression_evaluation(predict, answers):
+     score = 0
+
+     assert len(answers) == 1
+     answers = [remove_text_tags(answers[0])]
+     predict = remove_text_tags(predict)
+
+     if type(answers) == list:
+         for j in range(len(answers)):
+             answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
+             predict = predict.strip().replace('\n', ' ').replace(' ', '')
+             if answer in predict:
+                 score = 1
+     else:
+         answers = answers.strip().replace('\n', ' ').replace(' ', '')
+         predict = predict.strip().replace('\n', ' ').replace(' ', '')
+         if answers in predict:
+             score = 1
+     return score
+
+
+ if __name__ == '__main__':
+     test_predict = 'apple pie and banana'
+     test_answers = ['apple', 'banana pie', 'apple pie and orange']
+
+     vqa_score = vqa_evaluation(test_predict, test_answers)
+     print(f"VQA evaluation score for predict '{test_predict}' and answers {test_answers}: {vqa_score}")
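For references of five or more words, vqa_evaluation switches from substring matching to an ANLS-style score: 1 minus the Levenshtein distance normalized by the longer string, kept only when it reaches 0.5. A small illustrative check of that branch (values invented for the example; assumes the module above is importable):

    from evalscope.benchmarks.ocr_bench.ocr_bench_v2.vqa_metric import (
        levenshtein_distance,
        vqa_evaluation,
    )

    ref = 'the quick brown fox jumps over the lazy dog'   # 9 words -> ANLS branch
    pred = 'the quick brown fox jumps over a lazy dog'

    dist = levenshtein_distance(pred, ref)                # raw edit distance (3)
    anls = 1 - dist / max(len(pred), len(ref))            # ~0.93, above the 0.5 cutoff
    print(anls, vqa_evaluation(pred, [ref]))              # vqa_evaluation returns the same value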