evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/evaluator/evaluator.py
CHANGED
@@ -8,15 +8,18 @@ and report generation.
 """

 import os
+import traceback
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
-from typing import TYPE_CHECKING, Dict, List
+from typing import TYPE_CHECKING, Callable, Dict, List

 from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
+from evalscope.constants import HEARTBEAT_INTERVAL_SEC
 from evalscope.report import Report, gen_table
+from evalscope.utils.function_utils import run_in_threads_with_progress
+from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
@@ -24,8 +27,6 @@ if TYPE_CHECKING:
     from evalscope.config import TaskConfig
     from evalscope.utils.io_utils import OutputsStructure

-from evalscope.utils.logger import get_logger
-
 logger = get_logger()


@@ -91,17 +92,27 @@ class DefaultEvaluator(Evaluator):
             Report: The complete evaluation report containing all metrics and results.
         """
         # Load the dataset and evaluate each subset
+        logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
         dataset_dict = self.benchmark.load_dataset()
         agg_score_dict = defaultdict(list)

         # Process each subset (e.g., test, validation) independently
+        logger.info('Evaluating all subsets of the dataset...')
         for subset, dataset in dataset_dict.items():
-
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
+            logger.info(f'Evaluating subset: {subset}')
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score

         # Generate the report based on aggregated scores
+        logger.info('Generating report...')
         report = self.get_report(agg_score_dict)
+
+        # Finalize the evaluation process
+        self.finalize()
+        logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
         return report

     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -121,12 +132,15 @@
             List[AggScore]: Aggregated scores for this subset.
         """
         # Get model predictions for all samples in the subset
+        logger.info(f'Getting predictions for subset: {subset}')
         task_states = self.get_answers(subset, dataset)

         # Calculate evaluation metrics for each prediction
+        logger.info(f'Getting reviews for subset: {subset}')
         sample_scores = self.get_reviews(subset, task_states)

         # Aggregate individual sample scores into subset-level metrics
+        logger.info(f'Aggregating scores for subset: {subset}')
         agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
         return agg_scores

@@ -148,51 +162,48 @@
         """
         # Initialize task state list and filter cached predictions if caching is enabled
         if self.use_cache:
-
+            cached_task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
         else:
-
+            cached_task_state_list = []

         # Get output directory for storing model predictions
         model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))

         # Convert dataset to list for parallel processing
         dataset_list = list(dataset)
-
         if not dataset_list:
-            return
-        [old lines 163-194 were removed; their content is not rendered in this diff view]
-        return task_state_list
+            return cached_task_state_list
+
+        logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
+
+        def worker(sample: Sample) -> TaskState:
+            return self._predict_sample(sample, model_prediction_dir)
+
+        def on_result(sample: Sample, task_state: TaskState) -> None:
+            model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
+            logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+        def on_error(sample: Sample, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        finished_task_states = run_in_threads_with_progress(
+            dataset_list,
+            worker,
+            desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.eval_batch_size,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_result=on_result,
+            on_error=on_error,
+            filter_none_results=True,
+        )
+
+        logger.info(f'Finished getting predictions for subset: {subset}.')
+        return cached_task_state_list + finished_task_states

     def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
         """
@@ -229,50 +240,58 @@
         """
         # Initialize sample score list and filter cached reviews if caching is enabled
         if self.use_cache and not self.task_config.rerun_review:
-
+            cached_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
         else:
             # Init a clean sample score list
-
+            cached_score_list = []
             self.cache_manager.delete_review_cache(subset)

         if not task_states:
-            return
-        [old lines 240-275 were removed; their content is not rendered in this diff view]
+            return cached_score_list
+
+        logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')
+
+        def worker(task_state: TaskState) -> SampleScore:
+            return self._review_task_state(task_state)
+
+        def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+            review_result = self.cache_manager.save_review_cache(
+                subset=subset,
+                task_state=task_state,
+                sample_score=sample_score,
+                save_metadata=self.benchmark.save_metadata
+            )
+            logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+        def on_error(task_state: TaskState, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        # Run reviews in parallel
+        reviewed_scores = run_in_threads_with_progress(
+            task_states,
+            worker,
+            desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.judge_worker_num,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_error=on_error,
+            # Do not persist interim results when batch scoring is enabled
+            on_result=None if self.benchmark.use_batch_scoring else on_result,
+            filter_none_results=False,
+        )
+
+        # Batch calculate metrics if supported by the benchmark
+        if self.benchmark.use_batch_scoring:
+            reviewed_scores = self._batch_review_task_states(
+                task_states=task_states, reviewed_scores=reviewed_scores, on_result=on_result
+            )
+
+        logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(reviewed_scores)}')
+        return cached_score_list + reviewed_scores

     def _review_task_state(self, task_state: TaskState) -> SampleScore:
         """
@@ -288,6 +307,40 @@
         sample_score = self.benchmark.calculate_metrics(task_state=task_state)
         return sample_score

+    def _batch_review_task_states(
+        self, task_states: List[TaskState], reviewed_scores: List[SampleScore],
+        on_result: Callable[[TaskState, SampleScore], None]
+    ) -> List[SampleScore]:
+        valid_indices = [i for i, score in enumerate(reviewed_scores) if score is not None]
+        if not valid_indices:
+            return reviewed_scores
+
+        task_states = [task_states[i] for i in valid_indices]
+        reviewed_scores = [reviewed_scores[i] for i in valid_indices]
+
+        # Iterate in batches with progress bar
+        all_reviewed_scores = []
+        total = len(task_states)
+        batch_size = self.task_config.judge_worker_num
+        with tqdm(total=total, desc='Scoring (batch)', unit='sample') as pbar:
+            for start in range(0, total, batch_size):
+                # Process batch
+                end = min(start + batch_size, total)
+                batch_task_states = task_states[start:end]
+                batch_scores = reviewed_scores[start:end]
+                # Batch calculate metrics
+                updated_reviewed_scores = self.benchmark.batch_calculate_metrics(
+                    task_states=batch_task_states, sample_scores=batch_scores
+                )
+                # Append results
+                all_reviewed_scores.extend(updated_reviewed_scores)
+                # Save each result to cache
+                for task_state, sample_score in zip(batch_task_states, updated_reviewed_scores):
+                    on_result(task_state, sample_score)
+
+                pbar.update(len(batch_task_states))
+        return all_reviewed_scores
+
     def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
         """
         Generate a comprehensive evaluation report from aggregated scores.
@@ -317,7 +370,7 @@

         # Generate and display a summary table of results
         try:
-            report_table = gen_table(report_list=[report], add_overall_metric=
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
             logger.info(f'\n{self.benchmark_name} report table:'
                         f'\n{report_table} \n')
         except Exception:
@@ -335,3 +388,6 @@
         report.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
         return report
+
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)
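The refactor above replaces the evaluator's inline ThreadPoolExecutor loops (the removed concurrent.futures import) with a shared helper, run_in_threads_with_progress from evalscope/utils/function_utils.py, driven by worker, on_result and on_error callbacks plus a heartbeat interval. The body of that helper is not part of this diff; the sketch below is only an illustration, inferred from the call sites above, of how such a helper could be implemented. Everything beyond the visible keyword arguments is an assumption, not the actual evalscope implementation.

# Illustrative sketch only: NOT the real evalscope helper. Signature inferred from the
# call sites in the diff; behavior (completion-order results, simple heartbeat) is assumed.
import time
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from typing import Callable, List, Optional, TypeVar

from tqdm import tqdm

T = TypeVar('T')
R = TypeVar('R')


def run_in_threads_with_progress(
    items: List[T],
    worker: Callable[[T], R],
    desc: str = '',
    max_workers: int = 4,
    heartbeat_sec: Optional[float] = None,
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
    filter_none_results: bool = False,
) -> List[R]:
    """Run `worker` over `items` in a thread pool, reporting progress via tqdm."""
    results: List[R] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor, tqdm(total=len(items), desc=desc) as pbar:
        future_to_item = {executor.submit(worker, item): item for item in items}
        pending = set(future_to_item)
        last_beat = time.monotonic()
        while pending:
            done, pending = wait(pending, timeout=heartbeat_sec, return_when=FIRST_COMPLETED)
            if heartbeat_sec and not done and time.monotonic() - last_beat >= heartbeat_sec:
                # Heartbeat log so long-running batches do not look stalled
                pbar.write(f'{desc} still running, {len(pending)} tasks pending...')
                last_beat = time.monotonic()
            for future in done:
                item = future_to_item[future]
                try:
                    result = future.result()
                except Exception as exc:  # delegate error handling to the caller's on_error
                    if on_error is not None:
                        on_error(item, exc)
                    result = None
                else:
                    if on_result is not None:
                        on_result(item, result)
                if result is not None or not filter_none_results:
                    results.append(result)
                pbar.update(1)
    return results

Routing cache persistence through on_result keeps the worker functions free of I/O, which is why the diff can reuse the same helper for both the prediction and review passes; note the real helper may additionally preserve input order, which this sketch does not.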
evalscope/metrics/bert_score/scorer.py
ADDED
@@ -0,0 +1,338 @@
+# flake8: noqa
+import numpy as np
+import os
+import pandas as pd
+import time
+import torch
+import warnings
+from collections import defaultdict
+
+from .utils import (
+    bert_cos_score_idf,
+    get_bert_embedding,
+    get_hash,
+    get_idf_dict,
+    get_model,
+    get_tokenizer,
+    lang2model,
+    model2layers,
+    sent_encode,
+)
+
+
+class BERTScorer:
+    """
+    BERTScore Scorer Object.
+    """
+
+    def __init__(
+        self,
+        model_id_or_path=None,
+        model_type=None,
+        num_layers=None,
+        batch_size=64,
+        nthreads=4,
+        all_layers=False,
+        idf=False,
+        idf_sents=None,
+        device=None,
+        lang=None,
+        rescale_with_baseline=False,
+        baseline_path=None,
+        use_fast_tokenizer=False,
+    ):
+        """
+        Args:
+            - :param: `model_type` (str): contexual embedding model specification, default using the suggested
+                      model for the target langauge; has to specify at least one of
+                      `model_type` or `lang`
+            - :param: `num_layers` (int): the layer of representation to use.
+                      default using the number of layer tuned on WMT16 correlation data
+            - :param: `verbose` (bool): turn on intermediate status update
+            - :param: `idf` (bool): a booling to specify whether to use idf or not (this should be True even if `idf_sents` is given)
+            - :param: `idf_sents` (List of str): list of sentences used to compute the idf weights
+            - :param: `device` (str): on which the contextual embedding model will be allocated on.
+                      If this argument is None, the model lives on cuda:0 if cuda is available.
+            - :param: `batch_size` (int): bert score processing batch size
+            - :param: `nthreads` (int): number of threads
+            - :param: `lang` (str): language of the sentences; has to specify
+                      at least one of `model_type` or `lang`. `lang` needs to be
+                      specified when `rescale_with_baseline` is True.
+            - :param: `return_hash` (bool): return hash code of the setting
+            - :param: `rescale_with_baseline` (bool): rescale bertscore with pre-computed baseline
+            - :param: `baseline_path` (str): customized baseline file
+            - :param: `use_fast_tokenizer` (bool): `use_fast` parameter passed to HF tokenizer
+        """
+
+        assert (lang is not None or model_type is not None), 'Either lang or model_type should be specified'
+
+        if rescale_with_baseline:
+            assert (lang is not None), 'Need to specify Language when rescaling with baseline'
+
+        if device is None:
+            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        else:
+            self.device = device
+
+        self._lang = lang
+        self._rescale_with_baseline = rescale_with_baseline
+        self._idf = idf
+        self.batch_size = batch_size
+        self.nthreads = nthreads
+        self.all_layers = all_layers
+        self.model_id_or_path = model_id_or_path
+
+        if model_type is None:
+            lang = lang.lower()
+            self._model_type = lang2model[lang]
+        else:
+            self._model_type = model_type
+
+        if num_layers is None:
+            self._num_layers = model2layers[self.model_type]
+        else:
+            self._num_layers = num_layers
+
+        # Building model and tokenizer
+        self._use_fast_tokenizer = use_fast_tokenizer
+        self._tokenizer = get_tokenizer(self.model_id_or_path, self._use_fast_tokenizer)
+        self._model = get_model(self.model_id_or_path, self.num_layers, self.all_layers)
+        self._model.to(self.device)
+
+        self._idf_dict = None
+        if idf_sents is not None:
+            self.compute_idf(idf_sents)
+
+        self._baseline_vals = None
+        self.baseline_path = baseline_path
+        self.use_custom_baseline = self.baseline_path is not None
+        if self.baseline_path is None:
+            self.baseline_path = os.path.join(
+                os.path.dirname(__file__),
+                f'rescale_baseline/{self.lang}/{self.model_type}.tsv',
+            )
+
+    @property
+    def lang(self):
+        return self._lang
+
+    @property
+    def idf(self):
+        return self._idf
+
+    @property
+    def model_type(self):
+        return self._model_type
+
+    @property
+    def num_layers(self):
+        return self._num_layers
+
+    @property
+    def rescale_with_baseline(self):
+        return self._rescale_with_baseline
+
+    @property
+    def baseline_vals(self):
+        if self._baseline_vals is None:
+            if os.path.isfile(self.baseline_path):
+                if not self.all_layers:
+                    self._baseline_vals = torch.from_numpy(
+                        pd.read_csv(self.baseline_path).iloc[self.num_layers].to_numpy()
+                    )[1:].float()
+                else:
+                    self._baseline_vals = (
+                        torch.from_numpy(pd.read_csv(self.baseline_path).to_numpy())[:, 1:].unsqueeze(1).float()
+                    )
+            else:
+                raise ValueError(f'Baseline not Found for {self.model_type} on {self.lang} at {self.baseline_path}')
+
+        return self._baseline_vals
+
+    @property
+    def use_fast_tokenizer(self):
+        return self._use_fast_tokenizer
+
+    @property
+    def hash(self):
+        return get_hash(
+            self.model_type,
+            self.num_layers,
+            self.idf,
+            self.rescale_with_baseline,
+            self.use_custom_baseline,
+            self.use_fast_tokenizer,
+        )
+
+    def compute_idf(self, sents):
+        """
+        Args:
+
+        """
+        if self._idf_dict is not None:
+            warnings.warn('Overwriting the previous importance weights.')
+
+        self._idf_dict = get_idf_dict(sents, self._tokenizer, nthreads=self.nthreads)
+
+    def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False):
+        """
+        Args:
+            - :param: `cands` (list of str): candidate sentences
+            - :param: `refs` (list of str or list of list of str): reference sentences
+
+        Return:
+            - :param: `(P, R, F)`: each is of shape (N); N = number of input
+                      candidate reference pairs. if returning hashcode, the
+                      output will be ((P, R, F), hashcode). If a candidate have
+                      multiple references, the returned score of this candidate is
+                      the *best* score among all references.
+        """
+
+        ref_group_boundaries = None
+        if not isinstance(refs[0], str):
+            ref_group_boundaries = []
+            ori_cands, ori_refs = cands, refs
+            cands, refs = [], []
+            count = 0
+            for cand, ref_group in zip(ori_cands, ori_refs):
+                cands += [cand] * len(ref_group)
+                refs += ref_group
+                ref_group_boundaries.append((count, count + len(ref_group)))
+                count += len(ref_group)
+
+        if verbose:
+            print('calculating scores...')
+            start = time.perf_counter()
+
+        if self.idf:
+            assert self._idf_dict, 'IDF weights are not computed'
+            idf_dict = self._idf_dict
+        else:
+            idf_dict = defaultdict(lambda: 1.0)
+            idf_dict[self._tokenizer.sep_token_id] = 0
+            idf_dict[self._tokenizer.cls_token_id] = 0
+
+        all_preds = bert_cos_score_idf(
+            self._model,
+            refs,
+            cands,
+            self._tokenizer,
+            idf_dict,
+            verbose=verbose,
+            device=self.device,
+            batch_size=batch_size,
+            all_layers=self.all_layers,
+        ).cpu()
+
+        if ref_group_boundaries is not None:
+            max_preds = []
+            for start, end in ref_group_boundaries:
+                max_preds.append(all_preds[start:end].max(dim=0)[0])
+            all_preds = torch.stack(max_preds, dim=0)
+
+        if self.rescale_with_baseline:
+            all_preds = (all_preds - self.baseline_vals) / (1 - self.baseline_vals)
+
+        out = all_preds[..., 0], all_preds[..., 1], all_preds[..., 2] # P, R, F
+
+        if verbose:
+            time_diff = time.perf_counter() - start
+            print(f'done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec')
+
+        if return_hash:
+            out = tuple([out, self.hash])
+
+        return out
+
+    def plot_example(self, candidate, reference, fname=''):
+        """
+        Args:
+            - :param: `candidate` (str): a candidate sentence
+            - :param: `reference` (str): a reference sentence
+            - :param: `fname` (str): path to save the output plot
+        """
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+        assert isinstance(candidate, str)
+        assert isinstance(reference, str)
+
+        idf_dict = defaultdict(lambda: 1.0)
+        idf_dict[self._tokenizer.sep_token_id] = 0
+        idf_dict[self._tokenizer.cls_token_id] = 0
+
+        hyp_embedding, masks, padded_idf = get_bert_embedding(
+            [candidate],
+            self._model,
+            self._tokenizer,
+            idf_dict,
+            device=self.device,
+            all_layers=False,
+        )
+        ref_embedding, masks, padded_idf = get_bert_embedding(
+            [reference],
+            self._model,
+            self._tokenizer,
+            idf_dict,
+            device=self.device,
+            all_layers=False,
+        )
+        ref_embedding.div_(torch.norm(ref_embedding, dim=-1).unsqueeze(-1))
+        hyp_embedding.div_(torch.norm(hyp_embedding, dim=-1).unsqueeze(-1))
+        sim = torch.bmm(hyp_embedding, ref_embedding.transpose(1, 2))
+        sim = sim.squeeze(0).cpu()
+
+        r_tokens = [self._tokenizer.decode([i]) for i in sent_encode(self._tokenizer, reference)][1:-1]
+        h_tokens = [self._tokenizer.decode([i]) for i in sent_encode(self._tokenizer, candidate)][1:-1]
+        sim = sim[1:-1, 1:-1]
+
+        if self.rescale_with_baseline:
+            sim = (sim - self.baseline_vals[2].item()) / (1 - self.baseline_vals[2].item())
+
+        fig, ax = plt.subplots(figsize=(len(r_tokens), len(h_tokens)))
+        im = ax.imshow(sim, cmap='Blues', vmin=0, vmax=1)
+
+        # We want to show all ticks...
+        ax.set_xticks(np.arange(len(r_tokens)))
+        ax.set_yticks(np.arange(len(h_tokens)))
+        # ... and label them with the respective list entries
+        ax.set_xticklabels(r_tokens, fontsize=10)
+        ax.set_yticklabels(h_tokens, fontsize=10)
+        ax.grid(False)
+        plt.xlabel('Reference (tokenized)', fontsize=14)
+        plt.ylabel('Candidate (tokenized)', fontsize=14)
+        title = 'Similarity Matrix'
+        if self.rescale_with_baseline:
+            title += ' (after Rescaling)'
+        plt.title(title, fontsize=14)
+
+        divider = make_axes_locatable(ax)
+        cax = divider.append_axes('right', size='2%', pad=0.2)
+        fig.colorbar(im, cax=cax)
+
+        # Rotate the tick labels and set their alignment.
+        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
+
+        # Loop over data dimensions and create text annotations.
+        for i in range(len(h_tokens)):
+            for j in range(len(r_tokens)):
+                text = ax.text(
+                    j,
+                    i,
+                    '{:.3f}'.format(sim[i, j].item()),
+                    ha='center',
+                    va='center',
+                    color='k' if sim[i, j].item() < 0.5 else 'w',
+                )
+
+        fig.tight_layout()
+        if fname != '':
+            plt.savefig(fname, dpi=100)
+            print('Saved figure to file: ', fname)
+        plt.show()
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(hash={self.hash}, batch_size={self.batch_size}, nthreads={self.nthreads})'
+
+    def __str__(self):
+        return self.__repr__()
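For orientation, a minimal usage sketch of the BERTScorer class added above. The model id, the example sentences, and the assumption that lang='en' resolves to a RoBERTa-style checkpoint via lang2model/get_model/get_tokenizer are illustrative assumptions, not taken from the diff.

# Minimal usage sketch; model id and data are placeholders, not part of the release.
from evalscope.metrics.bert_score.scorer import BERTScorer

cands = ['The quick brown fox jumps over the lazy dog.']
refs = ['A quick brown fox leaps over a lazy dog.']

scorer = BERTScorer(
    model_id_or_path='roberta-large',  # assumption: a Hugging Face model id or local checkpoint path
    lang='en',                         # either `lang` or `model_type` must be given
    num_layers=17,                     # optional; defaults to the WMT16-tuned layer for the model type
    idf=False,
)
P, R, F = scorer.score(cands, refs)   # precision, recall, F1 tensors, one value per candidate/reference pair
print(f'BERTScore F1: {F.mean().item():.4f}')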