evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/evaluator/evaluator.py
@@ -1,483 +1,339 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+Default evaluator implementation for running benchmark evaluations.
+
+This module provides the DefaultEvaluator class which orchestrates the entire
+evaluation process including data loading, model inference, metric calculation,
+and report generation.
+"""
 
-import json
 import os
-import time
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from copy import deepcopy
 from tqdm import tqdm
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
-from evalscope.benchmarks import DataAdapter
-from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
+from evalscope.api.dataset import Dataset, DatasetDict, Sample
+from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
+from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 if TYPE_CHECKING:
-    from evalscope.models import BaseModelAdapter
+    from evalscope.api.benchmark import DataAdapter
+    from evalscope.api.model import Model
+    from evalscope.config import TaskConfig
+    from evalscope.utils.io_utils import OutputsStructure
+
+from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-class Evaluator(object):
+class DefaultEvaluator(Evaluator):
     """
-    The evaluator for model on datasets.
+    Default Evaluator for running evaluations on benchmarks.
+
+    This evaluator handles the complete evaluation pipeline:
+    1. Loading datasets from benchmarks
+    2. Running model inference on samples
+    3. Calculating evaluation metrics
+    4. Generating and saving reports
+    5. Managing caching for predictions and reviews
 
     Args:
-        dataset_name_or_path: str, the dataset name or path.
-            if the dataset is a local path, e.g. /path/to/your_dataset_name,
-            then the task name will be the basename of the path, which is `your_dataset_name`.
-        data_adapter: DataAdapter, the data adapter for the dataset.
-        model_adapter: BaseModelAdapter, the model adapter for the model.
-        outputs: OutputsStructure, the outputs dir. Default: None
-        task_cfg: TaskConfig, the overall task config. Default: None
-        **kwargs: kwargs.
+        benchmark: The data adapter for loading and processing data.
+        model: The model to be evaluated.
+        outputs: The output structure for saving evaluation results.
+        task_config: The task configuration.
     """
 
-    def __init__(self,
-                 data_adapter: DataAdapter,
-                 model_adapter: 'BaseModelAdapter',
-                 outputs: OutputsStructure = None,
-                 task_cfg: TaskConfig = None,
-                 **kwargs):
-
-        self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
-        self.model_name = task_cfg.model_id
-
-        self.data_adapter = data_adapter
-        self.model_adapter = model_adapter
-        self.model_cfg = model_adapter.model_cfg
-        self.eval_type = task_cfg.eval_type
-        self.dataset_hub = task_cfg.dataset_hub
-        self.stage = task_cfg.stage
-        self.use_cache = task_cfg.use_cache
-        self.task_cfg = task_cfg
-        # Deal with the output paths
-        self.outputs_structure = outputs
-        self.kwargs = kwargs
-
-        self._init_judge()
-
-    def _init_judge(self):
-        if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
-            self.judge = None
-        else:
-            from evalscope.metrics import LLMJudge
-            self.judge = LLMJudge(**self.task_cfg.judge_model_args)
-
-    def load_dataset(self):
-        dataset = self.data_adapter.load(
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
-
-        # Get prompts from dataset
-        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
-        # Limit and index prompts
-        limited_prompts = defaultdict(list)
-        for subset_name, prompts_list in prompts.items():
-            # If limit is None, use all prompts
-            if self.task_cfg.limit is None:
-                limit = len(prompts_list)
-            else:
-                if isinstance(self.task_cfg.limit, int):
-                    limit = self.task_cfg.limit
-                elif isinstance(self.task_cfg.limit, float):
-                    limit = int(len(prompts_list) * self.task_cfg.limit)
-            # Limit the number of prompts
-            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
-                prompt[AnswerKeys.INDEX] = index
-                limited_prompts[subset_name].append(prompt)
-
-        return limited_prompts
-
-    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
-        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
-        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
-        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
-        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-        answer_d[AnswerKeys.ANSWER_ID] = answer_id
-        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
-        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
-        return answer_d
-
-    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        try:
-            # get answer from model
-            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
-        except Exception as e:
-            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
-            # if ignore_errors is True, continue to next input
-            if self.task_cfg.ignore_errors:
-                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return []
-            else:
-                raise e
-        # process answer
-        answers_list = []
-        for answer_d, input_prompt in zip(answer_ds, input_prompts):
-            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-        return answers_list
-
-    @staticmethod
-    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
-        # Filter prompts that have been answered
-        answers_list = []
-        if not use_cache or not os.path.exists(pred_file_path):
-            return answers_list, prompts_list
-
-        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-            indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
-
-            if all(index is None for index in indices):
-                return list(range(len(answers_list)))
-
-            return [index for index in indices if index is not None]
-
-        answers_list = jsonl_to_list(pred_file_path)
-        answered_indices = set(get_answered_indices(answers_list))
-        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
-
-        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
-        return answers_list, prompts
-
-    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
+    def __init__(
+        self,
+        benchmark: 'DataAdapter',
+        model: 'Model',
+        outputs: 'OutputsStructure',
+        task_config: 'TaskConfig',
+    ):
+        # Store core components needed for evaluation
+        self.benchmark = benchmark
+        self.model = model
+        self.outputs = outputs
+        self.task_config = task_config
+
+        # Extract frequently used identifiers
+        self.benchmark_name = benchmark.name
+        """Name of the benchmark being evaluated."""
+
+        self.model_name = task_config.model_id
+        """ID of the model being evaluated."""
+
+        self.use_cache = task_config.use_cache
+        """Whether to use cache for predictions."""
+
+        # Initialize cache manager for storing and retrieving cached results
+        self.cache_manager = CacheManager(
+            outputs=outputs,
+            model_name=self.model_name,
+            benchmark_name=self.benchmark_name,
+        )
+
+    def eval(self) -> Report:
         """
-        Get answers from model inference.
-        It is required to rewrite this method to support your own evaluator.
+        Run the complete evaluation process.
 
-        Args:
-            subset_name: subset name for benchmark.
-            prompts_list: prompts list.
-            infer_cfg: model inference config.
-                Attributes:
-                    do_sample: bool, whether to use sampling.
-                    top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
-                    top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
-                    temperature: float, the value used to module the next token probabilities.
-                    num_beams: int, number of beams for beam search. 1 means no beam search.
-                    max_length: int, the max length of the sequence to be generated.
-                    max_new_tokens: int, the max number of new tokens to be generated.
-                    repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            **kwargs: kwargs.
-
-        Returns: The list of answers.
+        This is the main entry point that orchestrates the entire evaluation:
+        1. Load dataset from benchmark
+        2. Evaluate each subset independently
+        3. Aggregate scores across subsets
+        4. Generate final evaluation report
+
+        Returns:
+            Report: The complete evaluation report containing all metrics and results.
         """
-        assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
-        assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
-        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
-
-        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
-        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
-        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
-
-        eval_batch_size = self.task_cfg.eval_batch_size
-        if self.task_cfg.eval_type == EvalType.SERVICE:
-            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
-                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
-                    futures = []
-                    for input_prompt in prompts_list:
-                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
-                    for future in as_completed(futures):
-                        answer_ds: List[dict] = future.result()
-                        answers_list.extend(answer_ds)
-                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                        pbar.update(len(answer_ds))
-        else:
-            batch_prompts_list = [
-                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
-            ]
-            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
-                for batch_prompts in batch_prompts_list:
-                    answer_ds: List[dict] = self._get_answer(
-                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
-                    answers_list.extend(answer_ds)
-                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(len(batch_prompts))
-
-        logger.info(f'Dump predictions to {pred_file_path}.')
-        return answers_list
-
-    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
-
-        if reviewer_spec is None:
-            reviewer_spec = {}
-
-        review_res = deepcopy(answer_d)
-        if AnswerKeys.CHOICES not in review_res:
-            review_res[AnswerKeys.CHOICES] = []
-            review_res[ReviewKeys.REVIEWED] = True
-            review_res[ReviewKeys.REVIEW_ID] = None
-            review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-            review_res[ReviewKeys.REVIEW_TIME] = time.time()
-            logger.warning(f'No choices found for answer dict: {review_res}')
-            return review_res
-
-        rev_choices = []
-        for choice in review_res[AnswerKeys.CHOICES]:
-            raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
-            answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-            gold_content = self.data_adapter.get_gold_answer(raw_input_d)
-
-            # Get review result based on judge strategy
-            use_llm = (
-                self.task_cfg.judge_strategy == JudgeStrategy.LLM
-                or (self.task_cfg.judge_strategy == JudgeStrategy.AUTO and self.data_adapter.llm_as_a_judge))
-
-            if use_llm:
-                # Use LLM as judge
-                assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
-                pred_content = self.data_adapter.llm_parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.llm_match(
-                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
-            else:
-                # Use rule-based judging
-                pred_content = self.data_adapter.parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.match(gold_content, pred_content)
-
-                # For LLM_RECALL strategy, use LLM to re-judge if rule-based result is not good
-                if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
-                        and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
-                    assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
-                    pred_content = self.data_adapter.llm_parse_pred_result(
-                        result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                    review_result = self.data_adapter.llm_match(
-                        gold_content, pred_content, self.judge, raw_input=raw_input_d)
-
-            choice[ReviewKeys.REVIEW] = {
-                ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-                ReviewKeys.PRED: pred_content,
-                ReviewKeys.RESULT: review_result
-            }
-            rev_choices.append(choice)
-
-        review_res[AnswerKeys.CHOICES] = rev_choices
-        review_res[ReviewKeys.REVIEWED] = True
-        review_res[ReviewKeys.REVIEW_ID] = review_id
-        review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-        review_res[ReviewKeys.REVIEW_TIME] = time.time()
-
-        return review_res
-
-    def _generate_review_id(self, answer_d):
-        # Gen review_id (concat: answer_id + reviewer_spec)
-        answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
-        reviewer_spec_str = json.dumps(
-            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-        return review_id, reviewer_spec
-
-    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
+        # Load the dataset and evaluate each subset
+        dataset_dict = self.benchmark.load_dataset()
+        agg_score_dict = defaultdict(list)
+
+        # Process each subset (e.g., test, validation) independently
+        for subset, dataset in dataset_dict.items():
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
+            subset_score = self.evaluate_subset(subset, dataset)
+            agg_score_dict[subset] = subset_score
+
+        # Generate the report based on aggregated scores
+        report = self.get_report(agg_score_dict)
+        return report
+
+    def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
         """
-        Get reviews from answers.
-        It is required to rewrite this method to support your own evaluator.
+        Evaluate a single subset of the dataset.
+
+        This method processes one subset through the complete evaluation pipeline:
+        1. Get model predictions for all samples
+        2. Calculate evaluation metrics for predictions
+        3. Aggregate individual sample scores
 
         Args:
-            subset_name: subset name of benchmark
-            answers_list: inference results list.
-            **kwargs: kwargs.
+            subset: Name of the subset being evaluated (e.g., 'test', 'validation').
+            dataset: The dataset subset containing samples to evaluate.
 
-        Returns: reviews list.
+        Returns:
+            List[AggScore]: Aggregated scores for this subset.
         """
-        reviews_list = []
-
-        review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
-        review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
-        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
-
-        # Load existing reviews if using cache
-        existing_reviews = {}
-        if self.use_cache and os.path.exists(review_file_path):
-            with open(review_file_path, 'r') as f:
-                for line in f:
-                    review = json.loads(line.strip())
-                    existing_reviews[review['index']] = review
-            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
-
-        def process_single_review(answer_d):
-            # Check if review already exists in cache
-            if self.use_cache and answer_d['index'] in existing_reviews:
-                return existing_reviews[answer_d['index']]
-
-            review_id, reviewer_spec = self._generate_review_id(answer_d)
-            # Get review
-            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
-            logger.debug(review_d)
-            return review_d
-
-        with ThreadPoolExecutor(max_workers=self.task_cfg.judge_worker_num) as executor:
-            # Submit all tasks and get futures
-            futures = [executor.submit(process_single_review, answer_d) for answer_d in answers_list]
-
-            # Process completed futures with progress bar
-            for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
-                review_d = future.result()
-                reviews_list.append(review_d)
-                # Dump new reviews only if not using cache or review is new
-                if not self.use_cache or review_d['index'] not in existing_reviews:
-                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
-
-        return reviews_list
-
-    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
+        # Get model predictions for all samples in the subset
+        task_states = self.get_answers(subset, dataset)
+
+        # Calculate evaluation metrics for each prediction
+        sample_scores = self.get_reviews(subset, task_states)
+
+        # Aggregate individual sample scores into subset-level metrics
+        agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
+        return agg_scores
+
+    def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
         """
-        To compute metrics from reviews_list for each subset.
-        It is required to rewrite this method to support your own evaluator.
+        Get model predictions for all samples in the dataset subset.
+
+        This method handles:
+        1. Loading cached predictions if available and caching is enabled
+        2. Running model inference on remaining samples in parallel
+        3. Saving new predictions to cache
 
         Args:
-            reviews_list: reviews list.
+            subset: Name of the subset being processed.
+            dataset: The dataset subset containing samples for prediction.
 
         Returns:
-            The metric result. Depends on the metric function in data_adapter.
+            List[TaskState]: Task states containing model predictions for each sample.
         """
-        # Get max choices
-        choices_lengths = [
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
-        ]
-        if choices_lengths:
-            max_choices = max(choices_lengths)
+        # Initialize task state list and filter cached predictions if caching is enabled
+        if self.use_cache:
+            task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
         else:
-            max_choices = 0
-
-        # Get review result
-        review_res_list = []
-        for review_d in reviews_list:
-            if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
+            task_state_list = []
 
-            if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
-                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
-            else:
-                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
-                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
-                    logger.warning(
-                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
-                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
+        # Get output directory for storing model predictions
+        model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
 
-            review_res_list.append(review_res)
+        # Convert dataset to list for parallel processing
+        dataset_list = list(dataset)
 
-        metric_score: List[dict] = self.data_adapter.compute_metric(
-            review_res_list=review_res_list, reviews_list=reviews_list)
+        if not dataset_list:
+            return task_state_list
 
-        return metric_score
+        # Process samples in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
+            # Submit all prediction tasks
+            future_to_sample = {
+                executor.submit(self._predict_sample, sample, model_prediction_dir): sample
+                for sample in dataset_list
+            }
 
-    def dump_report(self, reviews_score_all: List[dict]):
+            # Process completed tasks with progress bar
+            with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_sample):
+                    sample = future_to_sample[future]
+                    try:
+                        task_state = future.result()
+                        task_state_list.append(task_state)
+
+                        # Save the prediction result to cache for future use
+                        model_result = self.cache_manager.save_prediction_cache(
+                            subset, task_state, self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+                    except Exception as exc:
+                        logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return task_state_list
+
+    def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
         """
-        Get report for total reviews of specific dataset.
-        It is required to rewrite this method to support your own evaluator.
+        Helper method to predict a single sample.
 
         Args:
-            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
+            sample: The sample to predict.
+            model_prediction_dir: Directory for storing model predictions.
 
-        Returns: None
+        Returns:
+            TaskState: The task state containing the prediction result.
         """
-        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
-        os.makedirs(report_path, exist_ok=True)
-        # Get report map
-        report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, model_name=self.model_name)
-
-        # Make table
-        try:
-            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
-            logger.info(f'\n{self.dataset_name_or_path} report table:'
-                        f'\n{report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-        # Make report analysis
-        if self.task_cfg.analysis_report:
-            logger.info('Generating report analysis, please wait ...')
-            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
-            logger.info('Report analysis:\n%s', analysis)
-        else:
-            logger.info('Skipping report analysis (`analysis_report=False`).')
+        logger.debug(f'\n{sample.pretty_print()}')
 
-        # Dump report
-        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
-        report_map.to_json(report_file)
-        logger.info(f'Dump report to: {report_file} \n')
+        # Run model inference on the current sample
+        task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
+        return task_state
 
-        # Post process report
-        try:
-            self.data_adapter.post_process_report(report_map, report_path=report_path)
-        except Exception as e:
-            logger.error(f'Failed to post process report: {e}')
-
-        return report_map
-
-    def eval(self, **kwargs) -> dict:
+    def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
         """
-        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
-        It is required to rewrite this method to support your own evaluator.
+        Calculate evaluation metrics for model predictions.
 
-        The evaluation process is as follows:
-            1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
-            2. Get the input prompts from dataset with specific data adapter.
-            3. Get answers with model inference.
-            4. Get reviews with metric function (or reviewers).
-            5. Generate report from review results.
+        This method handles:
+        1. Loading cached review results if available and caching is enabled
+        2. Computing metrics for remaining task states in parallel
+        3. Saving new review results to cache
 
         Args:
-            infer_cfg: The config for model inference.
+            subset: Name of the subset being reviewed.
+            task_states: List of task states containing model predictions.
 
         Returns:
-            Dict of results. Depends on the stage of evaluation.
+            List[SampleScore]: Evaluation scores for each sample.
+        """
+        # Initialize sample score list and filter cached reviews if caching is enabled
+        if self.use_cache and not self.task_config.rerun_review:
+            sample_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
+        else:
+            # Init a clean sample score list
+            sample_score_list = []
+            self.cache_manager.delete_review_cache(subset)
+
+        if not task_states:
+            return sample_score_list
+
+        # Process task states in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
+            # Submit all review tasks
+            future_to_task_state = {
+                executor.submit(self._review_task_state, task_state): task_state
+                for task_state in task_states
+            }
 
-            stage == 'all': return the report_map
-            stage == 'infer': return the answers_map
-            stage == 'review': return the reviews_map
+            # Process completed tasks with progress bar
+            with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_task_state):
+                    task_state = future_to_task_state[future]
+                    try:
+                        sample_score = future.result()
+                        sample_score_list.append(sample_score)
+
+                        # Save the review result to cache for future use
+                        review_result = self.cache_manager.save_review_cache(
+                            subset=subset,
+                            task_state=task_state,
+                            sample_score=sample_score,
+                            save_metadata=self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+                    except Exception as exc:
+                        logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return sample_score_list
+
+    def _review_task_state(self, task_state: TaskState) -> SampleScore:
         """
+        Helper method to review a single task state.
 
-        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
+        Args:
+            task_state: The task state to review.
 
-        reviews_score_all = {}  # {subset_name: (score, num)}
-        stage_answers_dict = {}
-        stage_reviews_dict = {}
+        Returns:
+            SampleScore: The evaluation score for the task state.
+        """
+        # Compute evaluation metrics using the benchmark's metric calculation
+        sample_score = self.benchmark.calculate_metrics(task_state=task_state)
+        return sample_score
 
-        prompts = self.load_dataset()
-        for subset_name, prompts_list in prompts.items():
+    def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
+        """
+        Generate a comprehensive evaluation report from aggregated scores.
 
-            answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
-            if self.stage == EvalStage.INFER:
-                stage_answers_dict[subset_name] = answers_list
-                continue
+        This method handles:
+        1. Creating the evaluation report from scores
+        2. Generating and displaying a summary table
+        3. Optionally generating detailed analysis
+        4. Saving the report to file
 
-            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
+        Args:
+            agg_score_dict: Dictionary mapping subset names to their aggregated scores.
 
-            metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = metric_res
-            stage_reviews_dict[subset_name] = reviews_list
+        Returns:
+            Report: The complete evaluation report.
+        """
+        assert agg_score_dict, 'No scores to generate report from.'
 
-        if self.stage == EvalStage.INFER:
-            return stage_answers_dict
+        # Get paths for saving the report
+        report_path = self.cache_manager.get_report_path()
+        report_file = self.cache_manager.get_report_file()
 
-        if self.stage == EvalStage.REVIEW:
-            return stage_reviews_dict
+        # Generate the main evaluation report using benchmark-specific logic
+        report = self.benchmark.generate_report(
+            scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
+        )
 
-        # Generate report
-        report_map = self.dump_report(reviews_score_all)
+        # Generate and display a summary table of results
+        try:
+            report_table = gen_table(report_list=[report], add_overall_metric=True)
+            logger.info(f'\n{self.benchmark_name} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
 
-        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
+        # Generate detailed analysis if requested in configuration
+        if self.task_config.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_config.judge_model_args)
+            logger.info(f'Report analysis:\n{analysis}')
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
 
-        return report_map
+        # Save the complete report to file
+        report.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+        return report
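
For orientation, a minimal sketch of how the renamed pieces in this diff fit together in 1.0.x: DefaultEvaluator takes a benchmark DataAdapter, a Model, an OutputsStructure, and a TaskConfig, and eval() runs predict, review, aggregate, and report. Only the constructor arguments and the eval() -> Report signature come from the diff above; the import path of DefaultEvaluator and the way the benchmark, model, outputs, and task_config objects are constructed are assumptions and are not shown in this diff.

# A minimal sketch (not taken from this diff): wiring the new DefaultEvaluator.
# Assumption: DefaultEvaluator is exported from evalscope.evaluator
# (evalscope/evaluator/__init__.py changes by +1 -1 in this release).
from evalscope.evaluator import DefaultEvaluator


def run_benchmark(benchmark, model, outputs, task_config):
    """Run one benchmark end to end and return its Report.

    `benchmark` (DataAdapter), `model` (Model), `outputs` (OutputsStructure) and
    `task_config` (TaskConfig) are assumed to be built via the new evalscope.api
    interfaces, whose construction this diff does not show.
    """
    evaluator = DefaultEvaluator(
        benchmark=benchmark,
        model=model,
        outputs=outputs,
        task_config=task_config,
    )
    # eval() loads the dataset, predicts each subset, reviews the predictions,
    # aggregates scores, and writes the JSON report (see get_report above).
    return evaluator.eval()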