evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/benchmark/adapters/default_data_adapter.py (new file)
@@ -0,0 +1,683 @@
import os
from collections import defaultdict
from functools import partial
from overrides import override
from typing import Any, Callable, Dict, List, Optional, Tuple, Type

from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
from evalscope.api.metric import AggScore, SampleScore, Score
from evalscope.api.model import Model, ModelOutput
from evalscope.api.registry import get_aggregation, get_metric
from evalscope.constants import HubType, JudgeStrategy
from evalscope.report import Report, ReportGenerator
from evalscope.utils import get_logger
from ..benchmark import DataAdapter

logger = get_logger()


class DefaultDataAdapter(DataAdapter):
    """
    Default Data Adapter for the benchmark evaluation system.

    This class serves as the base implementation for data adapters that handle:
    - Dataset loading and preprocessing
    - Model inference execution
    - Metric calculation and aggregation
    - Report generation

    The adapter follows a pipeline architecture with hooks that can be overridden
    in subclasses to customize behavior for specific benchmarks or evaluation tasks.

    Key responsibilities:
    1. Load datasets with optional few-shot examples
    2. Process samples and format prompts
    3. Execute model inference with proper state management
    4. Calculate evaluation metrics and aggregate results
    5. Generate comprehensive evaluation reports

    This class can be extended to implement specific data loading and processing
    logic for different benchmark datasets and evaluation scenarios.
    """

    # ####################
    # DATA LOADING METHODS
    # ####################

    @override
    def load_dataset(self) -> DatasetDict:
        """
        Load the complete dataset including test data and optional few-shot examples.

        This method handles both local and remote dataset loading, processes samples
        with appropriate prompt formatting, and prepares few-shot examples if needed.

        Returns:
            DatasetDict: A dictionary containing the loaded and processed datasets,
                organized by subset names.
        """
        # Load the dataset
        self.test_dataset, self.fewshot_dataset = self.load()

        # Process each sample's input by applying prompt templates and few-shot formatting
        self._post_process_samples()

        return self.test_dataset

    def load(self) -> Tuple[DatasetDict, Optional[DatasetDict]]:
        """Load the dataset from disk or remote source.

        Returns:
            Tuple[DatasetDict, Optional[DatasetDict]]: The test dataset and few-shot dataset.
        """
        if os.path.exists(self.dataset_id):
            # Load dataset from local file system path
            with self._temporary_attribute('dataset_hub', HubType.LOCAL):
                return self.load_from_disk()
        else:
            # Load dataset from remote source (e.g., ModelScope, Huggingface)
            return self.load_from_remote()

    def load_from_remote(self):
        """Load dataset from remote source and prepare few-shot examples if needed."""
        test_dataset = None
        fewshot_dataset = None
        # Load dataset from remote source
        test_load_func = partial(self.load_subset, data_loader=RemoteDataLoader)
        test_dataset = self.load_subsets(test_load_func)

        # Load few-shot examples if few-shot prompting is enabled
        if self._should_load_fewshot():
            fewshot_load_func = partial(self.load_fewshot_subset, data_loader=RemoteDataLoader)
            fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
        return test_dataset, fewshot_dataset

    def load_from_disk(self, use_local_loader: bool = False):
        """
        Load dataset from local disk path.

        Args:
            use_local_loader: If True, use local file loading; otherwise use remote loading
                for local ModelScope datasets.
        """
        test_dataset = None
        fewshot_dataset = None
        if use_local_loader:
            # Use LocalDataLoader for actual local file loading
            test_load_func = partial(self.load_subset, data_loader=LocalDataLoader)
            test_dataset = self.load_subsets(test_load_func)

            # Load few-shot examples if few-shot prompting is enabled
            if self._should_load_fewshot():
                fewshot_load_func = partial(self.load_fewshot_subset, data_loader=LocalDataLoader)
                fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
            return test_dataset, fewshot_dataset
        else:
            # Fallback to remote loading for local ModelScope datasets
            return self.load_from_remote()

    def _should_load_fewshot(self) -> bool:
        """Check if few-shot dataset should be loaded."""
        return self.few_shot_num > 0 and self.train_split is not None

    def _post_process_samples(self):
        """Process all sample inputs with prompt formatting."""
        for subset in self.test_dataset.keys():
            for sample in self.test_dataset[subset]:
                if isinstance(sample.input, str):
                    sample.input = self.process_sample_str_input(sample, subset)

    def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
        """
        Convert a sample's input string to a list of ChatMessage objects.

        This method formats the sample input into a structured message format
        suitable for model inference, including system prompts if configured.
        """
        input_text = self.process_sample_input(sample, subset=subset)
        input_messages = [ChatMessageUser(content=input_text)]
        if self.system_prompt:
            input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
        return input_messages

    def process_sample_input(self, sample: Sample, subset: str) -> str:
        """
        Process a single sample's input by applying prompt templates and few-shot formatting.

        This method handles the complete input preparation pipeline:
        1. Retrieves few-shot examples if enabled
        2. Formats few-shot examples into demonstration text
        3. Applies appropriate prompt template (with or without few-shot context)

        Args:
            sample (Sample): The sample to process
            subset (str): The subset name this sample belongs to

        Returns:
            str: The formatted input text ready for model inference
        """
        if self.few_shot_num > 0:
            if self.fewshot_dataset is not None:
                # Retrieve few-shot examples for the current subset
                few_shot_samples = self.fewshot_dataset.get(subset)
                if few_shot_samples is None:
                    # Fallback: use the first available subset if current subset not found
                    first_key = next(iter(self.fewshot_dataset))
                    few_shot_samples = self.fewshot_dataset[first_key]
                # Select fewshot samples
                assert len(few_shot_samples) >= self.few_shot_num, (
                    f"""The dataset only have ({len(few_shot_samples)}) few-shot samples, but requested ({self.few_shot_num}) fewshot samples, please reduce 'few_shot_num'."""  # noqa: E501
                )
                # Convert few-shot samples to demonstration string
                few_shot = '\n\n'.join([self.sample_to_fewshot(sample) for sample in few_shot_samples])
            else:
                # Build few-shot examples inside the format method
                few_shot = ''
            # Format the input text with few-shot examples and main prompt
            input_text = self.format_fewshot_template(fewshot=few_shot, sample=sample)
        else:
            # No few-shot examples: use the prompt template directly
            input_text = self.format_prompt_template(sample=sample)
        return input_text

    def load_subsets(self, load_func: Callable[[str], Dataset], is_fewshot=False) -> DatasetDict:
        """
        Load multiple subsets of the dataset using the provided loading function.

        This method handles two loading strategies:
        1. Reformat mode: Load only the default subset and reformat it
        2. Multi-subset mode: Load all subsets specified in subset_list

        Args:
            load_func (Callable[[str], Dataset]): Function to load individual subsets

        Returns:
            DatasetDict: Dictionary containing all loaded subsets
        """
        if self.reformat_subset:
            # Load only the default subset
            subset_data = load_func(self.default_subset)
            # Reformat the subset to create multiple subsets based on sample keys
            # NOTE: subset_list and limit is applied here if specified
            limit = self.few_shot_num if is_fewshot else self.limit
            repeats = 1 if is_fewshot else self.repeats
            dataset_dict = DatasetDict.from_dataset(
                dataset=subset_data, subset_list=self.subset_list, limit=limit, repeats=repeats
            )
        else:
            # Load all specified subsets into separate entries
            subset_dict = defaultdict()
            for subset in self.subset_list:
                # Set current subset, since same benchmark need to differentiate
                with self._temporary_attribute('current_subset_name', subset):
                    subset_data = load_func(subset)
                    subset_dict[subset] = subset_data
            dataset_dict = DatasetDict(subset_dict)
        return dataset_dict

    def load_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
        """
        Load a specific subset of the dataset for evaluation.

        Args:
            subset (str): The subset identifier to load
            data_loader (Type[DataLoader]): The data loader class to use for loading

        Returns:
            Dataset: The loaded dataset subset with processed samples
        """
        # Determine the split and subset names based on configuration
        split = subset if self.split_as_subset else self.eval_split
        subset_name = self.default_subset if self.split_as_subset else subset

        # Create and configure the remote data loader
        loader = data_loader(
            data_id_or_path=self.dataset_id,
            split=split,
            subset=subset_name,
            sample_fields=self.record_to_sample,  # Custom sample conversion function
            filter_func=self.sample_filter,
            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
            repeats=self.repeats,  # Number of repetitions for each sample
            shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
            data_source=self.dataset_hub,  # Data source configuration
        )
        dataset = loader.load()
        return dataset

    def load_fewshot_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
        """
        Load a subset specifically for few-shot examples.

        Args:
            subset (str): The subset identifier to load few-shot examples from
            data_loader (Type[DataLoader]): The data loader class to use for loading

        Returns:
            Dataset: The loaded few-shot dataset with demonstration examples
        """
        # Use training split for few-shot examples
        split = subset if self.split_as_subset else self.train_split
        subset_name = self.default_subset if self.split_as_subset else subset

        # Create loader specifically configured for few-shot sampling
        loader = data_loader(
            data_id_or_path=self.dataset_id,
            split=split,
            subset=subset_name,
            sample_fields=self.record_to_sample,
            filter_func=self.sample_filter,  # Apply sample filtering if defined
            limit=self.few_shot_num
            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
            shuffle=self.few_shot_random,  # Randomize selection if enabled
            shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
            data_source=self.dataset_hub,
        )
        dataset = loader.load()
        return dataset

    def sample_filter(self, sample: Sample) -> bool:
        """
        Apply filtering to a dataset, only samples matching the predicate will be included.

        Args:
            sample (Sample): The sample to filter

        Returns:
            bool: True if the sample passes the filter, False otherwise
        """
        return True  # Default implementation allows all samples

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """
        Convert a raw data record to a Sample object.

        This method must be implemented in subclasses to handle dataset-specific
        field mapping and data processing logic.

        Args:
            record (Dict[str, Any]): Raw data record from the dataset

        Returns:
            Sample: Processed sample object ready for evaluation
        """
        raise NotImplementedError('This method should be implemented in subclasses')

    def sample_to_fewshot(self, sample: Sample) -> str:
        """
        Convert a Sample object to a formatted few-shot demonstration string.

        This method must be implemented in subclasses to define how samples
        are formatted as examples in few-shot prompts.

        Args:
            sample (Sample): The sample to convert to a few-shot example

        Returns:
            str: Formatted few-shot demonstration string
        """
        raise NotImplementedError('This method should be implemented in subclasses')

    def format_prompt_template(self, sample: Sample) -> str:
        """
        Format the basic prompt template with the sample data.

        This method applies the prompt template to format the input text
        for models when no few-shot examples are used.

        Args:
            sample (Sample): The sample object containing the prompt data

        Returns:
            str: The formatted prompt ready for model input
        """
        return self.prompt_template.format(question=sample.input)

    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
        """
        Format the few-shot template with demonstrations and the main prompt.

        This method combines few-shot examples with the main prompt using
        the configured few-shot template.

        Args:
            fewshot (str): The formatted few-shot demonstration examples
            sample (Sample): The sample object containing the prompt data

        Returns:
            str: The complete formatted input with few-shot context
        """
        return self.few_shot_prompt_template.format(fewshot=fewshot, question=sample.input)

    # #################
    # INFERENCE METHODS
    # #################

    def _on_inference_start(self, model: Model, sample: Sample) -> None:
        """
        Hook method called before inference starts.

        This method can be overridden in subclasses to implement custom
        preparation logic before model inference (e.g., model configuration,
        sample preprocessing, state initialization).

        Args:
            model (Model): The model that will perform inference
            sample (Sample): The sample to be processed
        """
        pass

    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
        """
        Hook method called during the actual inference process.

        This method executes the model inference and can be overridden
        to implement custom inference logic or model interaction patterns.

        Args:
            model (Model): The model to use for inference
            sample (Sample): The sample to process

        Returns:
            ModelOutput: The raw output from the model
        """
        # Execute model inference with the processed input and any tools
        model_output = model.generate(input=sample.input, tools=sample.tools)
        return model_output

    def _on_inference_end(
        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
    ) -> TaskState:
        """
        Hook method called after inference completes.

        This method processes the model output and creates a TaskState object
        that encapsulates all information about the completed inference task.
        You can save the model output to the specified output directory.

        Args:
            model (Model): The model that performed inference
            sample (Sample): The processed sample
            model_output (ModelOutput): The raw model output
            output_dir (str): The directory where the model output was saved

        Returns:
            TaskState: Complete state object for the inference task
        """
        return TaskState(
            model=model.name,
            sample=sample,
            messages=[model_output.message],
            output=model_output,
            completed=True,
        )

    @override
    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
        """
        Execute the complete inference pipeline for a single sample.

        This method orchestrates the full inference process using the hook methods:
        1. Pre-inference preparation
        2. Model inference execution
        3. Post-inference processing and state creation

        Args:
            model (Model): The model to use for inference
            sample (Sample): The sample to process
            output_dir (str): The directory to store the generated files

        Returns:
            TaskState: Complete state object containing inference results
        """
        self._on_inference_start(model, sample)
        model_output = self._on_inference(model, sample)
        task_state = self._on_inference_end(model, sample, model_output, output_dir, **kwargs)

        return task_state

    # ##########################
    # METRIC CALCULATION METHODS
    # ##########################

    def filter_prediction(self, prediction: str, task_state: TaskState) -> str:
        """
        Filter and prepare the model prediction for metric calculation.

        This method applies configured filters and custom answer extraction
        to clean and prepare the raw model output for evaluation.

        Args:
            prediction (str): The raw model prediction
            task_state (TaskState): The complete task state for context

        Returns:
            str: The filtered and extracted prediction ready for evaluation
        """
        if self.filter_ensemble is not None:
            # Apply configured filters to clean the prediction
            prediction = self.filter_ensemble(prediction)

        # Apply custom answer extraction logic
        extracted_prediction = self.extract_answer(prediction, task_state)
        return extracted_prediction

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        """
        Hook method for custom answer extraction from model predictions.

        This method can be overridden in subclasses to implement specific
        logic for extracting the final answer from complex model outputs.

        Args:
            prediction (str): The model prediction to extract from
            task_state (TaskState): The task state for additional context

        Returns:
            str: The extracted answer
        """
        return prediction

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        """
        Calculate evaluation scores by comparing prediction with reference.

        This method computes scores using all configured metrics and creates
        a comprehensive Score object with detailed evaluation results.

        Args:
            original_prediction (str): The original, unfiltered model prediction
            filtered_prediction (str): The filtered and processed prediction
            reference (str): The ground truth reference answer
            task_state (TaskState): The complete task state for context

        Returns:
            Score: Object containing all calculated metric scores and metadata
        """
        # Initialize the score object with prediction details
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )

        # Calculate scores for each configured metric
        for metric in self.metric_list:
            try:
                if isinstance(metric, str):
                    metric_name = metric
                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
                    metric_func = metric_scorer()  # Instantiate the metric scorer
                elif isinstance(metric, dict):
                    metric_name = list(metric.keys())[0]
                    metric_cls = get_metric(metric_name)
                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
                metric_score = metric_func(
                    prediction=filtered_prediction,
                    reference=reference,
                )
                score.value[metric_name] = metric_score
            except Exception as e:
                logger.error(f'Error calculating metric {metric}: {e}')
                score.value[metric_name] = 0
                score.metadata[metric_name] = f'error: {str(e)}'

        return score

    @override
    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
        """
        Calculate comprehensive evaluation metrics for a completed task.

        This method processes the task state to extract predictions, applies
        filtering and answer extraction, calculates all configured metrics,
        and packages the results into a SampleScore object.

        Args:
            task_state (TaskState): The completed task state to evaluate

        Returns:
            SampleScore: Complete scoring results for the sample

        Raises:
            AssertionError: If the task state is not marked as completed
        """
        assert task_state.completed, \
            'TaskState must be completed before calculating metrics.'

        # Extract the raw prediction from the model output
        prediction = task_state.output.completion

        # Apply filtering and answer extraction
        filtered_prediction = self.filter_prediction(prediction, task_state)

        if self.judge_strategy == JudgeStrategy.LLM_RECALL:
            # Step 1: Calculate standard metric scores (rule-based)
            rule_based_score = self.match_score(
                original_prediction=prediction,
                filtered_prediction=filtered_prediction,
                reference=task_state.target,
                task_state=task_state
            )

            # Step 2: Apply LLM judge if enabled and get final score
            final_score = self.maybe_llm_match_score(
                original_prediction=prediction,
                filtered_prediction=filtered_prediction,
                reference=task_state.target,
                task_state=task_state,
                rule_based_score=rule_based_score
            )
        else:
            if self.use_llm_judge:
                # Use LLM judge to compute the match score directly
                final_score = self.llm_match_score(
                    original_prediction=prediction,
                    filtered_prediction=filtered_prediction,
                    reference=task_state.target,
                    task_state=task_state
                )
            else:
                # Use standard match score calculation without LLM judge
                final_score = self.match_score(
                    original_prediction=prediction,
                    filtered_prediction=filtered_prediction,
                    reference=task_state.target,
                    task_state=task_state
                )

        # Package the results into a sample score object
        sample_score = SampleScore(
            score=final_score,
            sample_id=task_state.sample_id,
            group_id=task_state.group_id,
            sample_metadata=task_state.metadata,
        )

        return sample_score

    @override
    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
        """
        Aggregate individual sample scores into summary statistics.

        This method uses the configured aggregation method to compute
        summary statistics (e.g., mean, median, percentiles) across
        all sample scores for comprehensive evaluation results.

        Args:
            sample_scores (List[SampleScore]): Individual scores for all samples

        Returns:
            List[AggScore]: Aggregated scores and statistics
        """
        # Get the configured aggregation implementation
        aggregate_cls = get_aggregation(self.aggregation)
        aggregator = aggregate_cls()

        # Compute aggregated scores
        agg_scores = aggregator(sample_scores)

        return agg_scores

    # #########################
    # REPORT GENERATION METHODS
    # #########################

    def _on_generate_report_end(self, report: Report, output_dir: str, **kwargs) -> None:
        """
        Hook method called after generating the evaluation report.

        This method can be overridden in subclasses to implement custom
        post-processing of the generated report (e.g., additional formatting,
        custom visualizations, external integrations).

        Args:
            report (Report): The generated evaluation report
            output_dir (str): Directory where the report should be saved
        """
        pass

    def _on_generate_report(
        self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
    ) -> Report:
        """
        Hook method called during report generation.

        This method creates the evaluation report using the configured
        report generator and can be overridden to implement custom
        report generation logic.

        Args:
            scores (Dict[str, List[AggScore]]): Aggregated scores organized by subset
            model_name (str): Name of the evaluated model

        Returns:
            Report: The generated evaluation report
        """
        return ReportGenerator.generate_report(
            score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
        )

    @override
    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
        """
        Generate a comprehensive evaluation report from aggregated scores.

        This method orchestrates the complete report generation process:
        1. Creates the report using configured generators
        2. Applies any post-processing through hook methods

        Args:
            scores (Dict[str, List[AggScore]]): Aggregated scores by subset name
            model_name (str): Name of the model being evaluated

        Returns:
            Report: Complete evaluation report with results and analysis
        """
        report = self._on_generate_report(scores, model_name=model_name)
        self._on_generate_report_end(report, output_dir, **kwargs)
        return report
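
The adapter above is designed to be subclassed per benchmark: record_to_sample and sample_to_fewshot raise NotImplementedError, and hooks such as extract_answer are the intended customization points. The sketch below shows what a minimal concrete subclass might look like; the raw record keys ('question', 'answer') and the Sample(input=..., target=...) keyword arguments are illustrative assumptions, not taken from this diff, and benchmark registration is omitted.

    # Hypothetical subclass of DefaultDataAdapter -- a sketch under assumed field names.
    from typing import Any, Dict

    from evalscope.api.benchmark.adapters.default_data_adapter import DefaultDataAdapter
    from evalscope.api.dataset import Sample
    from evalscope.api.evaluator import TaskState


    class MyQAAdapter(DefaultDataAdapter):

        def record_to_sample(self, record: Dict[str, Any]) -> Sample:
            # Map dataset-specific fields onto the generic Sample object.
            # 'question' / 'answer' are assumed column names for illustration.
            return Sample(input=record['question'], target=record['answer'])

        def sample_to_fewshot(self, sample: Sample) -> str:
            # Render one training sample as a few-shot demonstration block.
            return f'Question: {sample.input}\nAnswer: {sample.target}'

        def extract_answer(self, prediction: str, task_state: TaskState) -> str:
            # Optional hook: keep only the text after the last 'Answer:' marker.
            return prediction.rsplit('Answer:', 1)[-1].strip()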