evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/benchmark/adapters/default_data_adapter.py (new file)
@@ -0,0 +1,683 @@
+ import os
+ from collections import defaultdict
+ from functools import partial
+ from overrides import override
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
+ from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
+ from evalscope.api.metric import AggScore, SampleScore, Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import get_aggregation, get_metric
+ from evalscope.constants import HubType, JudgeStrategy
+ from evalscope.report import Report, ReportGenerator
+ from evalscope.utils import get_logger
+ from ..benchmark import DataAdapter
+
+ logger = get_logger()
+
+
+ class DefaultDataAdapter(DataAdapter):
+     """
+     Default Data Adapter for the benchmark evaluation system.
+
+     This class serves as the base implementation for data adapters that handle:
+     - Dataset loading and preprocessing
+     - Model inference execution
+     - Metric calculation and aggregation
+     - Report generation
+
+     The adapter follows a pipeline architecture with hooks that can be overridden
+     in subclasses to customize behavior for specific benchmarks or evaluation tasks.
+
+     Key responsibilities:
+     1. Load datasets with optional few-shot examples
+     2. Process samples and format prompts
+     3. Execute model inference with proper state management
+     4. Calculate evaluation metrics and aggregate results
+     5. Generate comprehensive evaluation reports
+
+     This class can be extended to implement specific data loading and processing
+     logic for different benchmark datasets and evaluation scenarios.
+     """
+
+     # ####################
+     # DATA LOADING METHODS
+     # ####################
+
+     @override
+     def load_dataset(self) -> DatasetDict:
+         """
+         Load the complete dataset including test data and optional few-shot examples.
+
+         This method handles both local and remote dataset loading, processes samples
+         with appropriate prompt formatting, and prepares few-shot examples if needed.
+
+         Returns:
+             DatasetDict: A dictionary containing the loaded and processed datasets,
+                 organized by subset names.
+         """
+         # Load the dataset
+         self.test_dataset, self.fewshot_dataset = self.load()
+
+         # Process each sample's input by applying prompt templates and few-shot formatting
+         self._post_process_samples()
+
+         return self.test_dataset
+
+     def load(self) -> Tuple[DatasetDict, Optional[DatasetDict]]:
+         """Load the dataset from disk or remote source.
+
+         Returns:
+             Tuple[DatasetDict, Optional[DatasetDict]]: The test dataset and few-shot dataset.
+         """
+         if os.path.exists(self.dataset_id):
+             # Load dataset from local file system path
+             with self._temporary_attribute('dataset_hub', HubType.LOCAL):
+                 return self.load_from_disk()
+         else:
+             # Load dataset from remote source (e.g., ModelScope, Huggingface)
+             return self.load_from_remote()
+
+     def load_from_remote(self):
+         """Load dataset from remote source and prepare few-shot examples if needed."""
+         test_dataset = None
+         fewshot_dataset = None
+         # Load dataset from remote source
+         test_load_func = partial(self.load_subset, data_loader=RemoteDataLoader)
+         test_dataset = self.load_subsets(test_load_func)
+
+         # Load few-shot examples if few-shot prompting is enabled
+         if self._should_load_fewshot():
+             fewshot_load_func = partial(self.load_fewshot_subset, data_loader=RemoteDataLoader)
+             fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
+         return test_dataset, fewshot_dataset
+
+     def load_from_disk(self, use_local_loader: bool = False):
+         """
+         Load dataset from local disk path.
+
+         Args:
+             use_local_loader: If True, use local file loading; otherwise use remote loading
+                 for local ModelScope datasets.
+         """
+         test_dataset = None
+         fewshot_dataset = None
+         if use_local_loader:
+             # Use LocalDataLoader for actual local file loading
+             test_load_func = partial(self.load_subset, data_loader=LocalDataLoader)
+             test_dataset = self.load_subsets(test_load_func)
+
+             # Load few-shot examples if few-shot prompting is enabled
+             if self._should_load_fewshot():
+                 fewshot_load_func = partial(self.load_fewshot_subset, data_loader=LocalDataLoader)
+                 fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
+             return test_dataset, fewshot_dataset
+         else:
+             # Fallback to remote loading for local ModelScope datasets
+             return self.load_from_remote()
+
+     def _should_load_fewshot(self) -> bool:
+         """Check if few-shot dataset should be loaded."""
+         return self.few_shot_num > 0 and self.train_split is not None
+
+     def _post_process_samples(self):
+         """Process all sample inputs with prompt formatting."""
+         for subset in self.test_dataset.keys():
+             for sample in self.test_dataset[subset]:
+                 if isinstance(sample.input, str):
+                     sample.input = self.process_sample_str_input(sample, subset)
+
+     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+         """
+         Convert a sample's input string to a list of ChatMessage objects.
+
+         This method formats the sample input into a structured message format
+         suitable for model inference, including system prompts if configured.
+         """
+         input_text = self.process_sample_input(sample, subset=subset)
+         input_messages = [ChatMessageUser(content=input_text)]
+         if self.system_prompt:
+             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
+         return input_messages
+
+     def process_sample_input(self, sample: Sample, subset: str) -> str:
+         """
+         Process a single sample's input by applying prompt templates and few-shot formatting.
+
+         This method handles the complete input preparation pipeline:
+         1. Retrieves few-shot examples if enabled
+         2. Formats few-shot examples into demonstration text
+         3. Applies appropriate prompt template (with or without few-shot context)
+
+         Args:
+             sample (Sample): The sample to process
+             subset (str): The subset name this sample belongs to
+
+         Returns:
+             str: The formatted input text ready for model inference
+         """
+         if self.few_shot_num > 0:
+             if self.fewshot_dataset is not None:
+                 # Retrieve few-shot examples for the current subset
+                 few_shot_samples = self.fewshot_dataset.get(subset)
+                 if few_shot_samples is None:
+                     # Fallback: use the first available subset if current subset not found
+                     first_key = next(iter(self.fewshot_dataset))
+                     few_shot_samples = self.fewshot_dataset[first_key]
+                 # Select fewshot samples
+                 assert len(few_shot_samples) >= self.few_shot_num, (
+                     f"""The dataset only have ({len(few_shot_samples)}) few-shot samples, but requested ({self.few_shot_num}) fewshot samples, please reduce 'few_shot_num'."""  # noqa: E501
+                 )
+                 # Convert few-shot samples to demonstration string
+                 few_shot = '\n\n'.join([self.sample_to_fewshot(sample) for sample in few_shot_samples])
+             else:
+                 # Build few-shot examples inside the format method
+                 few_shot = ''
+             # Format the input text with few-shot examples and main prompt
+             input_text = self.format_fewshot_template(fewshot=few_shot, sample=sample)
+         else:
+             # No few-shot examples: use the prompt template directly
+             input_text = self.format_prompt_template(sample=sample)
+         return input_text
+
+     def load_subsets(self, load_func: Callable[[str], Dataset], is_fewshot=False) -> DatasetDict:
+         """
+         Load multiple subsets of the dataset using the provided loading function.
+
+         This method handles two loading strategies:
+         1. Reformat mode: Load only the default subset and reformat it
+         2. Multi-subset mode: Load all subsets specified in subset_list
+
+         Args:
+             load_func (Callable[[str], Dataset]): Function to load individual subsets
+
+         Returns:
+             DatasetDict: Dictionary containing all loaded subsets
+         """
+         if self.reformat_subset:
+             # Load only the default subset
+             subset_data = load_func(self.default_subset)
+             # Reformat the subset to create multiple subsets based on sample keys
+             # NOTE: subset_list and limit is applied here if specified
+             limit = self.few_shot_num if is_fewshot else self.limit
+             repeats = 1 if is_fewshot else self.repeats
+             dataset_dict = DatasetDict.from_dataset(
+                 dataset=subset_data, subset_list=self.subset_list, limit=limit, repeats=repeats
+             )
+         else:
+             # Load all specified subsets into separate entries
+             subset_dict = defaultdict()
+             for subset in self.subset_list:
+                 # Set current subset, since same benchmark need to differentiate
+                 with self._temporary_attribute('current_subset_name', subset):
+                     subset_data = load_func(subset)
+                     subset_dict[subset] = subset_data
+             dataset_dict = DatasetDict(subset_dict)
+         return dataset_dict
+
+     def load_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
+         """
+         Load a specific subset of the dataset for evaluation.
+
+         Args:
+             subset (str): The subset identifier to load
+             data_loader (Type[DataLoader]): The data loader class to use for loading
+
+         Returns:
+             Dataset: The loaded dataset subset with processed samples
+         """
+         # Determine the split and subset names based on configuration
+         split = subset if self.split_as_subset else self.eval_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create and configure the remote data loader
+         loader = data_loader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,  # Custom sample conversion function
+             filter_func=self.sample_filter,
+             limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
+             repeats=self.repeats,  # Number of repetitions for each sample
+             shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
+             data_source=self.dataset_hub,  # Data source configuration
+         )
+         dataset = loader.load()
+         return dataset
+
+     def load_fewshot_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
+         """
+         Load a subset specifically for few-shot examples.
+
+         Args:
+             subset (str): The subset identifier to load few-shot examples from
+             data_loader (Type[DataLoader]): The data loader class to use for loading
+
+         Returns:
+             Dataset: The loaded few-shot dataset with demonstration examples
+         """
+         # Use training split for few-shot examples
+         split = subset if self.split_as_subset else self.train_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create loader specifically configured for few-shot sampling
+         loader = data_loader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,
+             filter_func=self.sample_filter,  # Apply sample filtering if defined
+             limit=self.few_shot_num
+             if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
+             shuffle=self.few_shot_random,  # Randomize selection if enabled
+             shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
+             data_source=self.dataset_hub,
+         )
+         dataset = loader.load()
+         return dataset
+
+     def sample_filter(self, sample: Sample) -> bool:
+         """
+         Apply filtering to a dataset, only samples matching the predicate will be included.
+
+         Args:
+             sample (Sample): The sample to filter
+
+         Returns:
+             bool: True if the sample passes the filter, False otherwise
+         """
+         return True  # Default implementation allows all samples
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a raw data record to a Sample object.
+
+         This method must be implemented in subclasses to handle dataset-specific
+         field mapping and data processing logic.
+
+         Args:
+             record (Dict[str, Any]): Raw data record from the dataset
+
+         Returns:
+             Sample: Processed sample object ready for evaluation
+         """
+         raise NotImplementedError('This method should be implemented in subclasses')
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         """
+         Convert a Sample object to a formatted few-shot demonstration string.
+
+         This method must be implemented in subclasses to define how samples
+         are formatted as examples in few-shot prompts.
+
+         Args:
+             sample (Sample): The sample to convert to a few-shot example
+
+         Returns:
+             str: Formatted few-shot demonstration string
+         """
+         raise NotImplementedError('This method should be implemented in subclasses')
+
+     def format_prompt_template(self, sample: Sample) -> str:
+         """
+         Format the basic prompt template with the sample data.
+
+         This method applies the prompt template to format the input text
+         for models when no few-shot examples are used.
+
+         Args:
+             sample (Sample): The sample object containing the prompt data
+
+         Returns:
+             str: The formatted prompt ready for model input
+         """
+         return self.prompt_template.format(question=sample.input)
+
+     def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+         """
+         Format the few-shot template with demonstrations and the main prompt.
+
+         This method combines few-shot examples with the main prompt using
+         the configured few-shot template.
+
+         Args:
+             fewshot (str): The formatted few-shot demonstration examples
+             sample (Sample): The sample object containing the prompt data
+
+         Returns:
+             str: The complete formatted input with few-shot context
+         """
+         return self.few_shot_prompt_template.format(fewshot=fewshot, question=sample.input)
+
+     # #################
+     # INFERENCE METHODS
+     # #################
+
+     def _on_inference_start(self, model: Model, sample: Sample) -> None:
+         """
+         Hook method called before inference starts.
+
+         This method can be overridden in subclasses to implement custom
+         preparation logic before model inference (e.g., model configuration,
+         sample preprocessing, state initialization).
+
+         Args:
+             model (Model): The model that will perform inference
+             sample (Sample): The sample to be processed
+         """
+         pass
+
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         """
+         Hook method called during the actual inference process.
+
+         This method executes the model inference and can be overridden
+         to implement custom inference logic or model interaction patterns.
+
+         Args:
+             model (Model): The model to use for inference
+             sample (Sample): The sample to process
+
+         Returns:
+             ModelOutput: The raw output from the model
+         """
+         # Execute model inference with the processed input and any tools
+         model_output = model.generate(input=sample.input, tools=sample.tools)
+         return model_output
+
+     def _on_inference_end(
+         self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+     ) -> TaskState:
+         """
+         Hook method called after inference completes.
+
+         This method processes the model output and creates a TaskState object
+         that encapsulates all information about the completed inference task.
+         You can save the model output to the specified output directory.
+
+         Args:
+             model (Model): The model that performed inference
+             sample (Sample): The processed sample
+             model_output (ModelOutput): The raw model output
+             output_dir (str): The directory where the model output was saved
+
+         Returns:
+             TaskState: Complete state object for the inference task
+         """
+         return TaskState(
+             model=model.name,
+             sample=sample,
+             messages=[model_output.message],
+             output=model_output,
+             completed=True,
+         )
+
+     @override
+     def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+         """
+         Execute the complete inference pipeline for a single sample.
+
+         This method orchestrates the full inference process using the hook methods:
+         1. Pre-inference preparation
+         2. Model inference execution
+         3. Post-inference processing and state creation
+
+         Args:
+             model (Model): The model to use for inference
+             sample (Sample): The sample to process
+             output_dir (str): The directory to store the generated files
+
+         Returns:
+             TaskState: Complete state object containing inference results
+         """
+         self._on_inference_start(model, sample)
+         model_output = self._on_inference(model, sample)
+         task_state = self._on_inference_end(model, sample, model_output, output_dir, **kwargs)
+
+         return task_state
+
+     # ##########################
+     # METRIC CALCULATION METHODS
+     # ##########################
+
+     def filter_prediction(self, prediction: str, task_state: TaskState) -> str:
+         """
+         Filter and prepare the model prediction for metric calculation.
+
+         This method applies configured filters and custom answer extraction
+         to clean and prepare the raw model output for evaluation.
+
+         Args:
+             prediction (str): The raw model prediction
+             task_state (TaskState): The complete task state for context
+
+         Returns:
+             str: The filtered and extracted prediction ready for evaluation
+         """
+         if self.filter_ensemble is not None:
+             # Apply configured filters to clean the prediction
+             prediction = self.filter_ensemble(prediction)
+
+         # Apply custom answer extraction logic
+         extracted_prediction = self.extract_answer(prediction, task_state)
+         return extracted_prediction
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         """
+         Hook method for custom answer extraction from model predictions.
+
+         This method can be overridden in subclasses to implement specific
+         logic for extracting the final answer from complex model outputs.
+
+         Args:
+             prediction (str): The model prediction to extract from
+             task_state (TaskState): The task state for additional context
+
+         Returns:
+             str: The extracted answer
+         """
+         return prediction
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate evaluation scores by comparing prediction with reference.
+
+         This method computes scores using all configured metrics and creates
+         a comprehensive Score object with detailed evaluation results.
+
+         Args:
+             original_prediction (str): The original, unfiltered model prediction
+             filtered_prediction (str): The filtered and processed prediction
+             reference (str): The ground truth reference answer
+             task_state (TaskState): The complete task state for context
+
+         Returns:
+             Score: Object containing all calculated metric scores and metadata
+         """
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Calculate scores for each configured metric
+         for metric in self.metric_list:
+             try:
+                 if isinstance(metric, str):
+                     metric_name = metric
+                     metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                     metric_func = metric_scorer()  # Instantiate the metric scorer
+                 elif isinstance(metric, dict):
+                     metric_name = list(metric.keys())[0]
+                     metric_cls = get_metric(metric_name)
+                     metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                 metric_score = metric_func(
+                     prediction=filtered_prediction,
+                     reference=reference,
+                 )
+                 score.value[metric_name] = metric_score
+             except Exception as e:
+                 logger.error(f'Error calculating metric {metric}: {e}')
+                 score.value[metric_name] = 0
+                 score.metadata[metric_name] = f'error: {str(e)}'
+
+         return score
+
+     @override
+     def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+         """
+         Calculate comprehensive evaluation metrics for a completed task.
+
+         This method processes the task state to extract predictions, applies
+         filtering and answer extraction, calculates all configured metrics,
+         and packages the results into a SampleScore object.
+
+         Args:
+             task_state (TaskState): The completed task state to evaluate
+
+         Returns:
+             SampleScore: Complete scoring results for the sample
+
+         Raises:
+             AssertionError: If the task state is not marked as completed
+         """
+         assert task_state.completed, \
+             'TaskState must be completed before calculating metrics.'
+
+         # Extract the raw prediction from the model output
+         prediction = task_state.output.completion
+
+         # Apply filtering and answer extraction
+         filtered_prediction = self.filter_prediction(prediction, task_state)
+
+         if self.judge_strategy == JudgeStrategy.LLM_RECALL:
+             # Step 1: Calculate standard metric scores (rule-based)
+             rule_based_score = self.match_score(
+                 original_prediction=prediction,
+                 filtered_prediction=filtered_prediction,
+                 reference=task_state.target,
+                 task_state=task_state
+             )
+
+             # Step 2: Apply LLM judge if enabled and get final score
+             final_score = self.maybe_llm_match_score(
+                 original_prediction=prediction,
+                 filtered_prediction=filtered_prediction,
+                 reference=task_state.target,
+                 task_state=task_state,
+                 rule_based_score=rule_based_score
+             )
+         else:
+             if self.use_llm_judge:
+                 # Use LLM judge to compute the match score directly
+                 final_score = self.llm_match_score(
+                     original_prediction=prediction,
+                     filtered_prediction=filtered_prediction,
+                     reference=task_state.target,
+                     task_state=task_state
+                 )
+             else:
+                 # Use standard match score calculation without LLM judge
+                 final_score = self.match_score(
+                     original_prediction=prediction,
+                     filtered_prediction=filtered_prediction,
+                     reference=task_state.target,
+                     task_state=task_state
+                 )
+
+         # Package the results into a sample score object
+         sample_score = SampleScore(
+             score=final_score,
+             sample_id=task_state.sample_id,
+             group_id=task_state.group_id,
+             sample_metadata=task_state.metadata,
+         )
+
+         return sample_score
+
+     @override
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Aggregate individual sample scores into summary statistics.
+
+         This method uses the configured aggregation method to compute
+         summary statistics (e.g., mean, median, percentiles) across
+         all sample scores for comprehensive evaluation results.
+
+         Args:
+             sample_scores (List[SampleScore]): Individual scores for all samples
+
+         Returns:
+             List[AggScore]: Aggregated scores and statistics
+         """
+         # Get the configured aggregation implementation
+         aggregate_cls = get_aggregation(self.aggregation)
+         aggregator = aggregate_cls()
+
+         # Compute aggregated scores
+         agg_scores = aggregator(sample_scores)
+
+         return agg_scores
+
+     # #########################
+     # REPORT GENERATION METHODS
+     # #########################
+
+     def _on_generate_report_end(self, report: Report, output_dir: str, **kwargs) -> None:
+         """
+         Hook method called after generating the evaluation report.
+
+         This method can be overridden in subclasses to implement custom
+         post-processing of the generated report (e.g., additional formatting,
+         custom visualizations, external integrations).
+
+         Args:
+             report (Report): The generated evaluation report
+             output_dir (str): Directory where the report should be saved
+         """
+         pass
+
+     def _on_generate_report(
+         self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
+     ) -> Report:
+         """
+         Hook method called during report generation.
+
+         This method creates the evaluation report using the configured
+         report generator and can be overridden to implement custom
+         report generation logic.
+
+         Args:
+             scores (Dict[str, List[AggScore]]): Aggregated scores organized by subset
+             model_name (str): Name of the evaluated model
+
+         Returns:
+             Report: The generated evaluation report
+         """
+         return ReportGenerator.generate_report(
+             score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
+         )
+
+     @override
+     def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+         """
+         Generate a comprehensive evaluation report from aggregated scores.
+
+         This method orchestrates the complete report generation process:
+         1. Creates the report using configured generators
+         2. Applies any post-processing through hook methods
+
+         Args:
+             scores (Dict[str, List[AggScore]]): Aggregated scores by subset name
+             model_name (str): Name of the model being evaluated
+
+         Returns:
+             Report: Complete evaluation report with results and analysis
+         """
+         report = self._on_generate_report(scores, model_name=model_name)
+         self._on_generate_report_end(report, output_dir, **kwargs)
+         return report
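Note on usage (not part of the diff above): the new DefaultDataAdapter is designed to be subclassed, with record_to_sample and sample_to_fewshot as the two hooks a benchmark adapter must implement before the load/inference/metric/report pipeline can run. The sketch below is illustrative only: the adapter name MyQADataAdapter, the record keys 'question' and 'answer', and the Sample(input=..., target=...) constructor fields are assumptions (Sample is defined in evalscope/api/dataset/dataset.py, which is not shown in this hunk), and the import path simply follows the file layout in this diff. The concrete adapters listed above (e.g. gsm8k_adapter.py, mmlu_adapter.py) are the authoritative examples of the real conventions.

from typing import Any, Dict

from evalscope.api.benchmark.adapters.default_data_adapter import DefaultDataAdapter
from evalscope.api.dataset import Sample


class MyQADataAdapter(DefaultDataAdapter):
    """Hypothetical benchmark adapter sketch built on the new pipeline (not shipped in 1.0.0)."""

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Dataset-specific field mapping; 'question' and 'answer' are assumed column names.
        return Sample(input=record['question'], target=record['answer'])

    def sample_to_fewshot(self, sample: Sample) -> str:
        # One few-shot demonstration block; process_sample_input() joins these with blank lines.
        return f'Question: {sample.input}\nAnswer: {sample.target}'

With prompt_template and few_shot_prompt_template configured on the adapter (a {question} placeholder for the former, {fewshot} plus {question} for the latter, per format_prompt_template and format_fewshot_template above), load_dataset() then yields chat-ready inputs, and run_inference, calculate_metrics, aggregate_scores, and generate_report cover the rest of the pipeline without further overrides.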