evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
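The manifest above shows the 0.17.x adapter layer (`evalscope/benchmarks/data_adapter.py`, `evalscope/models/adapters/*`, `evalscope/models/local_model.py`, ...) being removed and replaced by the new `evalscope/api/*` package. As a rough orientation only, the sketch below contrasts the two import layouts; the new-side imports are taken verbatim from the evaluator diff further down, while the commented old-side imports are assumptions inferred from the removed file paths, not confirmed 0.17.x symbols.

```python
# evalscope 0.17.x (modules removed in 1.0.0) -- assumed class names, shown for contrast only:
# from evalscope.benchmarks.data_adapter import DataAdapter
# from evalscope.models.adapters.base_adapter import BaseModelAdapter

# evalscope 1.0.0 (modules added in this release, as imported by the new evaluator below):
from evalscope.api.benchmark import DataAdapter
from evalscope.api.dataset import Dataset, DatasetDict, Sample
from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
from evalscope.api.metric import AggScore, SampleScore
from evalscope.api.model import Model
```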
evalscope/evaluator/evaluator.py
CHANGED
@@ -1,483 +1,337 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+Default evaluator implementation for running benchmark evaluations.
+
+This module provides the DefaultEvaluator class which orchestrates the entire
+evaluation process including data loading, model inference, metric calculation,
+and report generation.
+"""
 
-import json
 import os
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from copy import deepcopy
 from tqdm import tqdm
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
+from evalscope.api.dataset import Dataset, DatasetDict, Sample
+from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
+from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+    from evalscope.api.model import Model
+    from evalscope.config import TaskConfig
+    from evalscope.utils.io_utils import OutputsStructure
+
+from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-class Evaluator
+class DefaultEvaluator(Evaluator):
     """
+    Default Evaluator for running evaluations on benchmarks.
+
+    This evaluator handles the complete evaluation pipeline:
+    1. Loading datasets from benchmarks
+    2. Running model inference on samples
+    3. Calculating evaluation metrics
+    4. Generating and saving reports
+    5. Managing caching for predictions and reviews
 
     Args:
-        model_adapter: BaseModelAdapter, the model adapter for the model.
-        outputs: OutputsStructure, the outputs dir. Default: None
-        task_cfg: TaskConfig, the overall task config. Default: None
-        **kwargs: kwargs.
+        benchmark: The data adapter for loading and processing data.
+        model: The model to be evaluated.
+        outputs: The output structure for saving evaluation results.
+        task_config: The task configuration.
     """
 
-    def __init__(
-
-    def load_dataset(self):
-        dataset = self.data_adapter.load(
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
-
-        # Get prompts from dataset
-        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
-        # Limit and index prompts
-        limited_prompts = defaultdict(list)
-        for subset_name, prompts_list in prompts.items():
-            # If limit is None, use all prompts
-            if self.task_cfg.limit is None:
-                limit = len(prompts_list)
-            else:
-                if isinstance(self.task_cfg.limit, int):
-                    limit = self.task_cfg.limit
-                elif isinstance(self.task_cfg.limit, float):
-                    limit = int(len(prompts_list) * self.task_cfg.limit)
-            # Limit the number of prompts
-            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
-                prompt[AnswerKeys.INDEX] = index
-                limited_prompts[subset_name].append(prompt)
-
-        return limited_prompts
-
-    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
-        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
-        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
-        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
-        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-        answer_d[AnswerKeys.ANSWER_ID] = answer_id
-        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
-        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
-        return answer_d
-
-    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        try:
-            # get answer from model
-            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
-        except Exception as e:
-            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
-            # if ignore_errors is True, continue to next input
-            if self.task_cfg.ignore_errors:
-                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return []
-            else:
-                raise e
-        # process answer
-        answers_list = []
-        for answer_d, input_prompt in zip(answer_ds, input_prompts):
-            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-        return answers_list
-
-    @staticmethod
-    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
-        # Filter prompts that have been answered
-        answers_list = []
-        if not use_cache or not os.path.exists(pred_file_path):
-            return answers_list, prompts_list
-
-        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-            indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
-
-            if all(index is None for index in indices):
-                return list(range(len(answers_list)))
-
-            return [index for index in indices if index is not None]
-
-        answers_list = jsonl_to_list(pred_file_path)
-        answered_indices = set(get_answered_indices(answers_list))
-        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
-
-        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
-        return answers_list, prompts
-
-    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
-        """
-        It is required to rewrite this method to support your own evaluator.
-
-            temperature: float, the value used to module the next token probabilities.
-            num_beams: int, number of beams for beam search. 1 means no beam search.
-            max_length: int, the max length of the sequence to be generated.
-            max_new_tokens: int, the max number of new tokens to be generated.
-            repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            **kwargs: kwargs.
-
-        Returns: The list of answers.
-        """
-                for input_prompt in prompts_list:
-                    futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
-                for future in as_completed(futures):
-                    answer_ds: List[dict] = future.result()
-                    answers_list.extend(answer_ds)
-                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(len(answer_ds))
-        else:
-            batch_prompts_list = [
-                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
-            ]
-            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
-                for batch_prompts in batch_prompts_list:
-                    answer_ds: List[dict] = self._get_answer(
-                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
-                    answers_list.extend(answer_ds)
-                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(len(batch_prompts))
-
-        logger.info(f'Dump predictions to {pred_file_path}.')
-        return answers_list
-
-    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
-
-        if reviewer_spec is None:
-            reviewer_spec = {}
-
-        review_res = deepcopy(answer_d)
-        if AnswerKeys.CHOICES not in review_res:
-            review_res[AnswerKeys.CHOICES] = []
-            review_res[ReviewKeys.REVIEWED] = True
-            review_res[ReviewKeys.REVIEW_ID] = None
-            review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-            review_res[ReviewKeys.REVIEW_TIME] = time.time()
-            logger.warning(f'No choices found for answer dict: {review_res}')
-            return review_res
-
-        rev_choices = []
-        for choice in review_res[AnswerKeys.CHOICES]:
-            raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
-            answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-            gold_content = self.data_adapter.get_gold_answer(raw_input_d)
-
-            # Get review result based on judge strategy
-            use_llm = (
-                self.task_cfg.judge_strategy == JudgeStrategy.LLM
-                or (self.task_cfg.judge_strategy == JudgeStrategy.AUTO and self.data_adapter.llm_as_a_judge))
-
-            if use_llm:
-                # Use LLM as judge
-                assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
-                pred_content = self.data_adapter.llm_parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.llm_match(
-                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
-            else:
-                # Use rule-based judging
-                pred_content = self.data_adapter.parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.match(gold_content, pred_content)
-
-            # For LLM_RECALL strategy, use LLM to re-judge if rule-based result is not good
-            if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
-                    and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
-                assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
-                pred_content = self.data_adapter.llm_parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.llm_match(
-                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
-
-            choice[ReviewKeys.REVIEW] = {
-                ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-                ReviewKeys.PRED: pred_content,
-                ReviewKeys.RESULT: review_result
-            }
-            rev_choices.append(choice)
-
-        review_res[AnswerKeys.CHOICES] = rev_choices
-        review_res[ReviewKeys.REVIEWED] = True
-        review_res[ReviewKeys.REVIEW_ID] = review_id
-        review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-        review_res[ReviewKeys.REVIEW_TIME] = time.time()
-
-        return review_res
-
-    def _generate_review_id(self, answer_d):
-        # Gen review_id (concat: answer_id + reviewer_spec)
-        answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
-        reviewer_spec_str = json.dumps(
-            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-        return review_id, reviewer_spec
-
-    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
-        """
-
-        Args:
-            **kwargs: kwargs.
-
-        Returns:
-        """
-                    review = json.loads(line.strip())
-                    existing_reviews[review['index']] = review
-            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
-
-        def process_single_review(answer_d):
-            # Check if review already exists in cache
-            if self.use_cache and answer_d['index'] in existing_reviews:
-                return existing_reviews[answer_d['index']]
-
-            review_id, reviewer_spec = self._generate_review_id(answer_d)
-            # Get review
-            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
-            logger.debug(review_d)
-            return review_d
-
-        with ThreadPoolExecutor(max_workers=self.task_cfg.judge_worker_num) as executor:
-            # Submit all tasks and get futures
-            futures = [executor.submit(process_single_review, answer_d) for answer_d in answers_list]
-
-            # Process completed futures with progress bar
-            for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
-                review_d = future.result()
-                reviews_list.append(review_d)
-                # Dump new reviews only if not using cache or review is new
-                if not self.use_cache or review_d['index'] not in existing_reviews:
-                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
-
-        return reviews_list
-
-    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
-        """
-
-        Args:
-
-        Returns:
-        """
-        if choices_lengths:
-            max_choices = max(choices_lengths)
-        else:
-        # Get review result
-        review_res_list = []
-        for review_d in reviews_list:
-            if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
-
-            if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
-                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
-            else:
-                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
-                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
-                    logger.warning(
-                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
-                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
-
-            review_res_list.append(review_res)
-
-        metric_score: List[dict] = self.data_adapter.compute_metric(
-            review_res_list=review_res_list, reviews_list=reviews_list)
-
-        return metric_score
-
-    def dump_report(self, reviews_score_all: List[dict]):
-        """
-        Get report for total reviews of specific dataset.
-        It is required to rewrite this method to support your own evaluator.
-        """
-        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
-        os.makedirs(report_path, exist_ok=True)
-        # Get report map
-        report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, model_name=self.model_name)
-
-            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
-            logger.info(f'\n{self.dataset_name_or_path} report table:'
-                        f'\n{report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-        except Exception as e:
-            logger.error(f'Failed to post process report: {e}')
+    def __init__(
+        self,
+        benchmark: 'DataAdapter',
+        model: 'Model',
+        outputs: 'OutputsStructure',
+        task_config: 'TaskConfig',
+    ):
+        # Store core components needed for evaluation
+        self.benchmark = benchmark
+        self.model = model
+        self.outputs = outputs
+        self.task_config = task_config
+
+        # Extract frequently used identifiers
+        self.benchmark_name = benchmark.name
+        """Name of the benchmark being evaluated."""
+
+        self.model_name = task_config.model_id
+        """ID of the model being evaluated."""
+
+        self.use_cache = task_config.use_cache
+        """Whether to use cache for predictions."""
+
+        # Initialize cache manager for storing and retrieving cached results
+        self.cache_manager = CacheManager(
+            outputs=outputs,
+            model_name=self.model_name,
+            benchmark_name=self.benchmark_name,
+        )
+
+    def eval(self) -> Report:
+        """
+        Run the complete evaluation process.
+
+        This is the main entry point that orchestrates the entire evaluation:
+        1. Load dataset from benchmark
+        2. Evaluate each subset independently
+        3. Aggregate scores across subsets
+        4. Generate final evaluation report
+
+        Returns:
+            Report: The complete evaluation report containing all metrics and results.
+        """
+        # Load the dataset and evaluate each subset
+        dataset_dict = self.benchmark.load_dataset()
+        agg_score_dict = defaultdict(list)
+
+        # Process each subset (e.g., test, validation) independently
+        for subset, dataset in dataset_dict.items():
+            assert len(dataset) > 0, f'No samples found in subset: {subset}'
+            subset_score = self.evaluate_subset(subset, dataset)
+            agg_score_dict[subset] = subset_score
+
+        # Generate the report based on aggregated scores
+        report = self.get_report(agg_score_dict)
+        return report
+
+    def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
+        """
+        Evaluate a single subset of the dataset.
+
+        This method processes one subset through the complete evaluation pipeline:
+        1. Get model predictions for all samples
+        2. Calculate evaluation metrics for predictions
+        3. Aggregate individual sample scores
+
+        Args:
+            subset: Name of the subset being evaluated (e.g., 'test', 'validation').
+            dataset: The dataset subset containing samples to evaluate.
+
+        Returns:
+            List[AggScore]: Aggregated scores for this subset.
+        """
+        # Get model predictions for all samples in the subset
+        task_states = self.get_answers(subset, dataset)
+
+        # Calculate evaluation metrics for each prediction
+        sample_scores = self.get_reviews(subset, task_states)
+
+        # Aggregate individual sample scores into subset-level metrics
+        agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
+        return agg_scores
+
+    def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
+        """
+        Get model predictions for all samples in the dataset subset.
+
+        This method handles:
+        1. Loading cached predictions if available and caching is enabled
+        2. Running model inference on remaining samples in parallel
+        3. Saving new predictions to cache
+
+        Args:
+            subset: Name of the subset being processed.
+            dataset: The dataset subset containing samples for prediction.
+
+        Returns:
+            List[TaskState]: Task states containing model predictions for each sample.
+        """
+        # Initialize task state list and filter cached predictions if caching is enabled
+        if self.use_cache:
+            task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
+        else:
+            task_state_list = []
+
+        # Get output directory for storing model predictions
+        model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
+
+        # Convert dataset to list for parallel processing
+        dataset_list = list(dataset)
+
+        if not dataset_list:
+            return task_state_list
+
+        # Process samples in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
+            # Submit all prediction tasks
+            future_to_sample = {
+                executor.submit(self._predict_sample, sample, model_prediction_dir): sample
+                for sample in dataset_list
+            }
+
+            # Process completed tasks with progress bar
+            with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_sample):
+                    sample = future_to_sample[future]
+                    try:
+                        task_state = future.result()
+                        task_state_list.append(task_state)
+
+                        # Save the prediction result to cache for future use
+                        model_result = self.cache_manager.save_prediction_cache(
+                            subset, task_state, self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Model result: \n{model_result.model_dump_json(indent=2)}')
+
+                    except Exception as exc:
+                        logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return task_state_list
+
+    def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
+        """
+        Helper method to predict a single sample.
+
+        Args:
+            sample: The sample to predict.
+            model_prediction_dir: Directory for storing model predictions.
+
+        Returns:
+            TaskState: The task state containing the prediction result.
+        """
+        logger.debug(f'\n{sample.pretty_print()}')
+
+        # Run model inference on the current sample
+        task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
+        return task_state
+
+    def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
+        """
+        Calculate evaluation metrics for model predictions.
+
+        This method handles:
+        1. Loading cached review results if available and caching is enabled
+        2. Computing metrics for remaining task states in parallel
+        3. Saving new review results to cache
+
+        Args:
+            subset: Name of the subset being reviewed.
+            task_states: List of task states containing model predictions.
+
+        Returns:
+            List[SampleScore]: Evaluation scores for each sample.
+        """
+        # Initialize sample score list and filter cached reviews if caching is enabled
+        if self.use_cache and not self.task_config.rerun_review:
+            sample_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
+        else:
+            # Init a clean sample score list
+            sample_score_list = []
+            self.cache_manager.delete_review_cache(subset)
+
+        if not task_states:
+            return sample_score_list
+
+        # Process task states in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
+            # Submit all review tasks
+            future_to_task_state = {
+                executor.submit(self._review_task_state, task_state): task_state
+                for task_state in task_states
+            }
+
+            # Process completed tasks with progress bar
+            with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_task_state):
+                    task_state = future_to_task_state[future]
+                    try:
+                        sample_score = future.result()
+                        sample_score_list.append(sample_score)
+
+                        # Save the review result to cache for future use
+                        review_result = self.cache_manager.save_review_cache(
+                            subset=subset,
+                            task_state=task_state,
+                            sample_score=sample_score,
+                            save_metadata=self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Review result: \n{review_result.model_dump_json(indent=2)}')
+
+                    except Exception as exc:
+                        logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return sample_score_list
+
+    def _review_task_state(self, task_state: TaskState) -> SampleScore:
+        """
+        Helper method to review a single task state.
+
+        Args:
+            task_state: The task state to review.
+
+        Returns:
+            SampleScore: The evaluation score for the task state.
+        """
+        # Compute evaluation metrics using the benchmark's metric calculation
+        sample_score = self.benchmark.calculate_metrics(task_state=task_state)
+        return sample_score
+
+    def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
+        """
+        Generate a comprehensive evaluation report from aggregated scores.
+
+        This method handles:
+        1. Creating the evaluation report from scores
+        2. Generating and displaying a summary table
+        3. Optionally generating detailed analysis
+        4. Saving the report to file
+
+        Args:
+            agg_score_dict: Dictionary mapping subset names to their aggregated scores.
+
+        Returns:
+            Report: The complete evaluation report.
+        """
+        assert agg_score_dict, 'No scores to generate report from.'
+
+        # Get paths for saving the report
+        report_path = self.cache_manager.get_report_path()
+        report_file = self.cache_manager.get_report_file()
+
+        # Generate the main evaluation report using benchmark-specific logic
+        report = self.benchmark.generate_report(
+            scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
+        )
+
+        # Generate and display a summary table of results
+        try:
+            report_table = gen_table(report_list=[report], add_overall_metric=True)
+            logger.info(f'\n{self.benchmark_name} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Generate detailed analysis if requested in configuration
+        if self.task_config.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_config.judge_model_args)
+            logger.info(f'Report analysis:\n{analysis}')
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Save the complete report to file
+        report.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+        return report
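For orientation, here is a minimal sketch of how the new evaluator is driven. It relies only on what the diff above shows: the `DefaultEvaluator(benchmark, model, outputs, task_config)` constructor and the `eval()` method returning a `Report`. How the `benchmark`, `model`, `outputs`, and `task_config` objects are constructed is not part of this diff, so they are left as parameters here rather than built from any assumed evalscope 1.0.0 helper APIs.

```python
# Illustrative sketch, not a confirmed evalscope 1.0.0 recipe: only the
# DefaultEvaluator constructor and eval() call below are taken from the diff.
from evalscope.evaluator.evaluator import DefaultEvaluator


def run_benchmark(benchmark, model, outputs, task_config):
    """Run one benchmark evaluation and return the resulting Report."""
    evaluator = DefaultEvaluator(
        benchmark=benchmark,      # a DataAdapter from evalscope.api.benchmark
        model=model,              # a Model from evalscope.api.model
        outputs=outputs,          # an OutputsStructure with the result directories
        task_config=task_config,  # a TaskConfig carrying use_cache, eval_batch_size, etc.
    )
    # eval() loads the dataset, predicts and reviews each subset in parallel,
    # aggregates the scores, and dumps the report to JSON before returning it.
    report = evaluator.eval()
    return report
```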