evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/benchmark/adapters/default_data_adapter.py
@@ -0,0 +1,684 @@
+ import os
+ from collections import defaultdict
+ from functools import partial
+ from overrides import override
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
+ from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
+ from evalscope.api.metric import AggScore, SampleScore, Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import get_aggregation, get_metric
+ from evalscope.constants import HubType, JudgeStrategy
+ from evalscope.report import Report, ReportGenerator
+ from evalscope.utils import get_logger
+ from ..benchmark import DataAdapter
+
+ logger = get_logger()
+
+
+ class DefaultDataAdapter(DataAdapter):
+     """
+     Default Data Adapter for the benchmark evaluation system.
+
+     This class serves as the base implementation for data adapters that handle:
+     - Dataset loading and preprocessing
+     - Model inference execution
+     - Metric calculation and aggregation
+     - Report generation
+
+     The adapter follows a pipeline architecture with hooks that can be overridden
+     in subclasses to customize behavior for specific benchmarks or evaluation tasks.
+
+     Key responsibilities:
+     1. Load datasets with optional few-shot examples
+     2. Process samples and format prompts
+     3. Execute model inference with proper state management
+     4. Calculate evaluation metrics and aggregate results
+     5. Generate comprehensive evaluation reports
+
+     This class can be extended to implement specific data loading and processing
+     logic for different benchmark datasets and evaluation scenarios.
+     """
+
+     # ####################
+     # DATA LOADING METHODS
+     # ####################
+
+     @override
+     def load_dataset(self) -> DatasetDict:
+         """
+         Load the complete dataset including test data and optional few-shot examples.
+
+         This method handles both local and remote dataset loading, processes samples
+         with appropriate prompt formatting, and prepares few-shot examples if needed.
+
+         Returns:
+             DatasetDict: A dictionary containing the loaded and processed datasets,
+                 organized by subset names.
+         """
+         # Load the dataset
+         self.test_dataset, self.fewshot_dataset = self.load()
+
+         # Process each sample's input by applying prompt templates and few-shot formatting
+         self._post_process_samples()
+
+         return self.test_dataset
+
+     def load(self) -> Tuple[DatasetDict, Optional[DatasetDict]]:
+         """Load the dataset from disk or remote source.
+
+         Returns:
+             Tuple[DatasetDict, Optional[DatasetDict]]: The test dataset and few-shot dataset.
+         """
+         if os.path.exists(self.dataset_id):
+             # Load dataset from local file system path
+             with self._temporary_attribute('dataset_hub', HubType.LOCAL):
+                 return self.load_from_disk()
+         else:
+             # Load dataset from remote source (e.g., ModelScope, Huggingface)
+             return self.load_from_remote()
+
+     def load_from_remote(self):
+         """Load dataset from remote source and prepare few-shot examples if needed."""
+         test_dataset = None
+         fewshot_dataset = None
+         # Load dataset from remote source
+         test_load_func = partial(self.load_subset, data_loader=RemoteDataLoader)
+         test_dataset = self.load_subsets(test_load_func)
+
+         # Load few-shot examples if few-shot prompting is enabled
+         if self._should_load_fewshot():
+             fewshot_load_func = partial(self.load_fewshot_subset, data_loader=RemoteDataLoader)
+             fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
+         return test_dataset, fewshot_dataset
+
+     def load_from_disk(self, use_local_loader: bool = False):
+         """
+         Load dataset from local disk path.
+
+         Args:
+             use_local_loader: If True, use local file loading; otherwise use remote loading
+                 for local ModelScope datasets.
+         """
+         test_dataset = None
+         fewshot_dataset = None
+         if use_local_loader:
+             # Use LocalDataLoader for actual local file loading
+             test_load_func = partial(self.load_subset, data_loader=LocalDataLoader)
+             test_dataset = self.load_subsets(test_load_func)
+
+             # Load few-shot examples if few-shot prompting is enabled
+             if self._should_load_fewshot():
+                 fewshot_load_func = partial(self.load_fewshot_subset, data_loader=LocalDataLoader)
+                 fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
+             return test_dataset, fewshot_dataset
+         else:
+             # Fallback to remote loading for local ModelScope datasets
+             return self.load_from_remote()
+
+     def _should_load_fewshot(self) -> bool:
+         """Check if few-shot dataset should be loaded."""
+         return self.few_shot_num > 0 and self.train_split is not None
+
+     def _post_process_samples(self):
+         """Process all sample inputs with prompt formatting."""
+         for subset in self.test_dataset.keys():
+             for sample in self.test_dataset[subset]:
+                 if isinstance(sample.input, str):
+                     sample.input = self.process_sample_str_input(sample, subset)
+
+     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+         """
+         Convert a sample's input string to a list of ChatMessage objects.
+
+         This method formats the sample input into a structured message format
+         suitable for model inference, including system prompts if configured.
+         """
+         input_text = self.process_sample_input(sample, subset=subset)
+         input_messages = [ChatMessageUser(content=input_text)]
+         if self.system_prompt:
+             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
+         return input_messages
+
+     def process_sample_input(self, sample: Sample, subset: str) -> str:
+         """
+         Process a single sample's input by applying prompt templates and few-shot formatting.
+
+         This method handles the complete input preparation pipeline:
+         1. Retrieves few-shot examples if enabled
+         2. Formats few-shot examples into demonstration text
+         3. Applies appropriate prompt template (with or without few-shot context)
+
+         Args:
+             sample (Sample): The sample to process
+             subset (str): The subset name this sample belongs to
+
+         Returns:
+             str: The formatted input text ready for model inference
+         """
+         if self.few_shot_num > 0:
+             if self.fewshot_dataset is not None:
+                 # Retrieve few-shot examples for the current subset
+                 few_shot_samples = self.fewshot_dataset.get(subset)
+                 if few_shot_samples is None:
+                     # Fallback: use the first available subset if current subset not found
+                     first_key = next(iter(self.fewshot_dataset))
+                     few_shot_samples = self.fewshot_dataset[first_key]
+                 # Select fewshot samples
+                 assert len(few_shot_samples) >= self.few_shot_num, (
+                     f"""The dataset only have ({len(few_shot_samples)}) few-shot samples, but requested ({self.few_shot_num}) fewshot samples, please reduce 'few_shot_num'."""  # noqa: E501
+                 )
+                 # Convert few-shot samples to demonstration string
+                 few_shot = '\n\n'.join([self.sample_to_fewshot(sample) for sample in few_shot_samples])
+             else:
+                 # Build few-shot examples inside the format method
+                 few_shot = ''
+             # Format the input text with few-shot examples and main prompt
+             input_text = self.format_fewshot_template(fewshot=few_shot, sample=sample)
+         else:
+             # No few-shot examples: use the prompt template directly
+             input_text = self.format_prompt_template(sample=sample)
+         return input_text
+
+     def load_subsets(self, load_func: Callable[[str], Dataset], is_fewshot=False) -> DatasetDict:
+         """
+         Load multiple subsets of the dataset using the provided loading function.
+
+         This method handles two loading strategies:
+         1. Reformat mode: Load only the default subset and reformat it
+         2. Multi-subset mode: Load all subsets specified in subset_list
+
+         Args:
+             load_func (Callable[[str], Dataset]): Function to load individual subsets
+
+         Returns:
+             DatasetDict: Dictionary containing all loaded subsets
+         """
+         if self.reformat_subset:
+             # Load only the default subset
+             subset_data = load_func(self.default_subset)
+             # Reformat the subset to create multiple subsets based on sample keys
+             # NOTE: subset_list and limit is applied here if specified
+             limit = self.few_shot_num if is_fewshot else self.limit
+             repeats = 1 if is_fewshot else self.repeats
+             dataset_dict = DatasetDict.from_dataset(
+                 dataset=subset_data, subset_list=self.subset_list, limit=limit, repeats=repeats
+             )
+         else:
+             # Load all specified subsets into separate entries
+             subset_dict = defaultdict()
+             for subset in self.subset_list:
+                 # Set current subset, since same benchmark need to differentiate
+                 with self._temporary_attribute('current_subset_name', subset):
+                     subset_data = load_func(subset)
+                     subset_dict[subset] = subset_data
+             dataset_dict = DatasetDict(subset_dict)
+         return dataset_dict
+
+     def load_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
+         """
+         Load a specific subset of the dataset for evaluation.
+
+         Args:
+             subset (str): The subset identifier to load
+             data_loader (Type[DataLoader]): The data loader class to use for loading
+
+         Returns:
+             Dataset: The loaded dataset subset with processed samples
+         """
+         # Determine the split and subset names based on configuration
+         split = subset if self.split_as_subset else self.eval_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create and configure the remote data loader
+         loader = data_loader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,  # Custom sample conversion function
+             filter_func=self.sample_filter,
+             limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
+             repeats=self.repeats,  # Number of repetitions for each sample
+             shuffle=self.shuffle,  # Shuffle dataset if enabled
+             shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
+             data_source=self.dataset_hub,  # Data source configuration
+         )
+         dataset = loader.load()
+         return dataset
+
+     def load_fewshot_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
+         """
+         Load a subset specifically for few-shot examples.
+
+         Args:
+             subset (str): The subset identifier to load few-shot examples from
+             data_loader (Type[DataLoader]): The data loader class to use for loading
+
+         Returns:
+             Dataset: The loaded few-shot dataset with demonstration examples
+         """
+         # Use training split for few-shot examples
+         split = subset if self.split_as_subset else self.train_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create loader specifically configured for few-shot sampling
+         loader = data_loader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,
+             filter_func=self.sample_filter,  # Apply sample filtering if defined
+             limit=self.few_shot_num
+             if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
+             shuffle=self.few_shot_random,  # Randomize selection if enabled
+             shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
+             data_source=self.dataset_hub,
+         )
+         dataset = loader.load()
+         return dataset
+
+     def sample_filter(self, sample: Sample) -> bool:
+         """
+         Apply filtering to a dataset, only samples matching the predicate will be included.
+
+         Args:
+             sample (Sample): The sample to filter
+
+         Returns:
+             bool: True if the sample passes the filter, False otherwise
+         """
+         return True  # Default implementation allows all samples
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a raw data record to a Sample object.
+
+         This method must be implemented in subclasses to handle dataset-specific
+         field mapping and data processing logic.
+
+         Args:
+             record (Dict[str, Any]): Raw data record from the dataset
+
+         Returns:
+             Sample: Processed sample object ready for evaluation
+         """
+         raise NotImplementedError('This method should be implemented in subclasses')
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         """
+         Convert a Sample object to a formatted few-shot demonstration string.
+
+         This method must be implemented in subclasses to define how samples
+         are formatted as examples in few-shot prompts.
+
+         Args:
+             sample (Sample): The sample to convert to a few-shot example
+
+         Returns:
+             str: Formatted few-shot demonstration string
+         """
+         raise NotImplementedError('This method should be implemented in subclasses')
+
+     def format_prompt_template(self, sample: Sample) -> str:
+         """
+         Format the basic prompt template with the sample data.
+
+         This method applies the prompt template to format the input text
+         for models when no few-shot examples are used.
+
+         Args:
+             sample (Sample): The sample object containing the prompt data
+
+         Returns:
+             str: The formatted prompt ready for model input
+         """
+         return self.prompt_template.format(question=sample.input)
+
+     def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+         """
+         Format the few-shot template with demonstrations and the main prompt.
+
+         This method combines few-shot examples with the main prompt using
+         the configured few-shot template.
+
+         Args:
+             fewshot (str): The formatted few-shot demonstration examples
+             sample (Sample): The sample object containing the prompt data
+
+         Returns:
+             str: The complete formatted input with few-shot context
+         """
+         return self.few_shot_prompt_template.format(fewshot=fewshot, question=sample.input)
+
+     # #################
+     # INFERENCE METHODS
+     # #################
+
+     def _on_inference_start(self, model: Model, sample: Sample) -> None:
+         """
+         Hook method called before inference starts.
+
+         This method can be overridden in subclasses to implement custom
+         preparation logic before model inference (e.g., model configuration,
+         sample preprocessing, state initialization).
+
+         Args:
+             model (Model): The model that will perform inference
+             sample (Sample): The sample to be processed
+         """
+         pass
+
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         """
+         Hook method called during the actual inference process.
+
+         This method executes the model inference and can be overridden
+         to implement custom inference logic or model interaction patterns.
+
+         Args:
+             model (Model): The model to use for inference
+             sample (Sample): The sample to process
+
+         Returns:
+             ModelOutput: The raw output from the model
+         """
+         # Execute model inference with the processed input and any tools
+         model_output = model.generate(input=sample.input, tools=sample.tools)
+         return model_output
+
+     def _on_inference_end(
+         self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+     ) -> TaskState:
+         """
+         Hook method called after inference completes.
+
+         This method processes the model output and creates a TaskState object
+         that encapsulates all information about the completed inference task.
+         You can save the model output to the specified output directory.
+
+         Args:
+             model (Model): The model that performed inference
+             sample (Sample): The processed sample
+             model_output (ModelOutput): The raw model output
+             output_dir (str): The directory where the model output was saved
+
+         Returns:
+             TaskState: Complete state object for the inference task
+         """
+         return TaskState(
+             model=model.name,
+             sample=sample,
+             messages=[model_output.message],
+             output=model_output,
+             completed=True,
+         )
+
+     @override
+     def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+         """
+         Execute the complete inference pipeline for a single sample.
+
+         This method orchestrates the full inference process using the hook methods:
+         1. Pre-inference preparation
+         2. Model inference execution
+         3. Post-inference processing and state creation
+
+         Args:
+             model (Model): The model to use for inference
+             sample (Sample): The sample to process
+             output_dir (str): The directory to store the generated files
+
+         Returns:
+             TaskState: Complete state object containing inference results
+         """
+         self._on_inference_start(model, sample)
+         model_output = self._on_inference(model, sample)
+         task_state = self._on_inference_end(model, sample, model_output, output_dir, **kwargs)
+
+         return task_state
+
+     # ##########################
+     # METRIC CALCULATION METHODS
+     # ##########################
+
+     def filter_prediction(self, prediction: str, task_state: TaskState) -> str:
+         """
+         Filter and prepare the model prediction for metric calculation.
+
+         This method applies configured filters and custom answer extraction
+         to clean and prepare the raw model output for evaluation.
+
+         Args:
+             prediction (str): The raw model prediction
+             task_state (TaskState): The complete task state for context
+
+         Returns:
+             str: The filtered and extracted prediction ready for evaluation
+         """
+         if self.filter_ensemble is not None:
+             # Apply configured filters to clean the prediction
+             prediction = self.filter_ensemble(prediction)
+
+         # Apply custom answer extraction logic
+         extracted_prediction = self.extract_answer(prediction, task_state)
+         return extracted_prediction
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         """
+         Hook method for custom answer extraction from model predictions.
+
+         This method can be overridden in subclasses to implement specific
+         logic for extracting the final answer from complex model outputs.
+
+         Args:
+             prediction (str): The model prediction to extract from
+             task_state (TaskState): The task state for additional context
+
+         Returns:
+             str: The extracted answer
+         """
+         return prediction
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate evaluation scores by comparing prediction with reference.
+
+         This method computes scores using all configured metrics and creates
+         a comprehensive Score object with detailed evaluation results.
+
+         Args:
+             original_prediction (str): The original, unfiltered model prediction
+             filtered_prediction (str): The filtered and processed prediction
+             reference (str): The ground truth reference answer
+             task_state (TaskState): The complete task state for context
+
+         Returns:
+             Score: Object containing all calculated metric scores and metadata
+         """
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Calculate scores for each configured metric
+         for metric in self.metric_list:
+             try:
+                 if isinstance(metric, str):
+                     metric_name = metric
+                     metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                     metric_func = metric_scorer()  # Instantiate the metric scorer
+                 elif isinstance(metric, dict):
+                     metric_name = list(metric.keys())[0]
+                     metric_cls = get_metric(metric_name)
+                     metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                 metric_score = metric_func(
+                     prediction=filtered_prediction,
+                     reference=reference,
+                 )
+                 score.value[metric_name] = metric_score
+             except Exception as e:
+                 logger.error(f'Error calculating metric {metric}: {e}')
+                 score.value[metric_name] = 0
+                 score.metadata[metric_name] = f'error: {str(e)}'
+
+         return score
+
+     @override
+     def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+         """
+         Calculate comprehensive evaluation metrics for a completed task.
+
+         This method processes the task state to extract predictions, applies
+         filtering and answer extraction, calculates all configured metrics,
+         and packages the results into a SampleScore object.
+
+         Args:
+             task_state (TaskState): The completed task state to evaluate
+
+         Returns:
+             SampleScore: Complete scoring results for the sample
+
+         Raises:
+             AssertionError: If the task state is not marked as completed
+         """
+         assert task_state.completed, \
+             'TaskState must be completed before calculating metrics.'
+
+         # Extract the raw prediction from the model output
+         prediction = task_state.output.completion
+
+         # Apply filtering and answer extraction
+         filtered_prediction = self.filter_prediction(prediction, task_state)
+
+         if self.judge_strategy == JudgeStrategy.LLM_RECALL:
+             # Step 1: Calculate standard metric scores (rule-based)
+             rule_based_score = self.match_score(
+                 original_prediction=prediction,
+                 filtered_prediction=filtered_prediction,
+                 reference=task_state.target,
+                 task_state=task_state
+             )
+
+             # Step 2: Apply LLM judge if enabled and get final score
+             final_score = self.maybe_llm_match_score(
+                 original_prediction=prediction,
+                 filtered_prediction=filtered_prediction,
+                 reference=task_state.target,
+                 task_state=task_state,
+                 rule_based_score=rule_based_score
+             )
+         else:
+             if self.use_llm_judge:
+                 # Use LLM judge to compute the match score directly
+                 final_score = self.llm_match_score(
+                     original_prediction=prediction,
+                     filtered_prediction=filtered_prediction,
+                     reference=task_state.target,
+                     task_state=task_state
+                 )
+             else:
+                 # Use standard match score calculation without LLM judge
+                 final_score = self.match_score(
+                     original_prediction=prediction,
+                     filtered_prediction=filtered_prediction,
+                     reference=task_state.target,
+                     task_state=task_state
+                 )
+
+         # Package the results into a sample score object
+         sample_score = SampleScore(
+             score=final_score,
+             sample_id=task_state.sample_id,
+             group_id=task_state.group_id,
+             sample_metadata=task_state.metadata,
+         )
+
+         return sample_score
+
+     @override
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Aggregate individual sample scores into summary statistics.
+
+         This method uses the configured aggregation method to compute
+         summary statistics (e.g., mean, median, percentiles) across
+         all sample scores for comprehensive evaluation results.
+
+         Args:
+             sample_scores (List[SampleScore]): Individual scores for all samples
+
+         Returns:
+             List[AggScore]: Aggregated scores and statistics
+         """
+         # Get the configured aggregation implementation
+         aggregate_cls = get_aggregation(self.aggregation)
+         aggregator = aggregate_cls()
+
+         # Compute aggregated scores
+         agg_scores = aggregator(sample_scores)
+
+         return agg_scores
+
+     # #########################
+     # REPORT GENERATION METHODS
+     # #########################
+
+     def _on_generate_report_end(self, report: Report, output_dir: str, **kwargs) -> None:
+         """
+         Hook method called after generating the evaluation report.
+
+         This method can be overridden in subclasses to implement custom
+         post-processing of the generated report (e.g., additional formatting,
+         custom visualizations, external integrations).
+
+         Args:
+             report (Report): The generated evaluation report
+             output_dir (str): Directory where the report should be saved
+         """
+         pass
+
+     def _on_generate_report(
+         self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
+     ) -> Report:
+         """
+         Hook method called during report generation.
+
+         This method creates the evaluation report using the configured
+         report generator and can be overridden to implement custom
+         report generation logic.
+
+         Args:
+             scores (Dict[str, List[AggScore]]): Aggregated scores organized by subset
+             model_name (str): Name of the evaluated model
+
+         Returns:
+             Report: The generated evaluation report
+         """
+         return ReportGenerator.generate_report(
+             score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
+         )
+
+     @override
+     def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+         """
+         Generate a comprehensive evaluation report from aggregated scores.
+
+         This method orchestrates the complete report generation process:
+         1. Creates the report using configured generators
+         2. Applies any post-processing through hook methods
+
+         Args:
+             scores (Dict[str, List[AggScore]]): Aggregated scores by subset name
+             model_name (str): Name of the model being evaluated
+
+         Returns:
+             Report: Complete evaluation report with results and analysis
+         """
+         report = self._on_generate_report(scores, model_name=model_name)
+         self._on_generate_report_end(report, output_dir, **kwargs)
+         return report
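
For orientation only (not part of the diff): `record_to_sample` and `sample_to_fewshot` above are deliberately left as NotImplementedError hooks, and the formatting methods read `self.prompt_template` / `self.few_shot_prompt_template`. A concrete benchmark adapter therefore roughly takes the shape sketched below. This is a minimal illustration; the `Sample(input=..., target=...)` constructor, the import path, and the record field names ('question', 'answer') are assumptions inferred from the code above, not a verified evalscope 1.0 API.

# Hypothetical subclass sketch; names and field mappings are illustrative assumptions.
from typing import Any, Dict

from evalscope.api.benchmark import DefaultDataAdapter  # assumed re-export location
from evalscope.api.dataset import Sample


class MyQAAdapter(DefaultDataAdapter):
    """Map raw dataset records to Samples and format few-shot demonstrations."""

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Dataset-specific field mapping; 'question'/'answer' are placeholder keys.
        return Sample(input=record['question'], target=record['answer'])

    def sample_to_fewshot(self, sample: Sample) -> str:
        # One demonstration block; process_sample_input() joins these with blank lines.
        return f'Question: {sample.input}\nAnswer: {sample.target}'

The prompt templates and system prompt would presumably be supplied through the benchmark registration metadata introduced elsewhere in this release (see evalscope/api/benchmark/meta.py and registry.py in the file list), which is omitted from this sketch.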