evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,483 +1,337 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ """
3
+ Default evaluator implementation for running benchmark evaluations.
4
+
5
+ This module provides the DefaultEvaluator class which orchestrates the entire
6
+ evaluation process including data loading, model inference, metric calculation,
7
+ and report generation.
8
+ """
2
9
 
3
- import json
4
10
  import os
5
- import time
6
- from collections import OrderedDict, defaultdict
11
+ from collections import defaultdict
7
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
8
- from copy import deepcopy
9
13
  from tqdm import tqdm
10
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
14
+ from typing import TYPE_CHECKING, Dict, List, Tuple, Union
11
15
 
12
- from evalscope.benchmarks import DataAdapter
13
- from evalscope.config import TaskConfig
14
- from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
16
+ from evalscope.api.dataset import Dataset, DatasetDict, Sample
17
+ from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
18
+ from evalscope.api.metric import AggScore, SampleScore
15
19
  from evalscope.report import Report, gen_table
16
- from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
17
- from evalscope.utils.logger import get_logger
18
- from evalscope.utils.model_utils import dict_torch_dtype_to_str
19
20
 
20
21
  if TYPE_CHECKING:
21
- from evalscope.models import BaseModelAdapter
22
+ from evalscope.api.benchmark import DataAdapter
23
+ from evalscope.api.model import Model
24
+ from evalscope.config import TaskConfig
25
+ from evalscope.utils.io_utils import OutputsStructure
26
+
27
+ from evalscope.utils.logger import get_logger
22
28
 
23
29
  logger = get_logger()
24
30
 
25
31
 
26
- class Evaluator(object):
32
+ class DefaultEvaluator(Evaluator):
27
33
  """
28
- The evaluator for model on datasets.
34
+ Default Evaluator for running evaluations on benchmarks.
35
+
36
+ This evaluator handles the complete evaluation pipeline:
37
+ 1. Loading datasets from benchmarks
38
+ 2. Running model inference on samples
39
+ 3. Calculating evaluation metrics
40
+ 4. Generating and saving reports
41
+ 5. Managing caching for predictions and reviews
29
42
 
30
43
  Args:
31
- dataset_name_or_path: str, the dataset name or path.
32
- if the dataset is a local path, e.g. /path/to/your_dataset_name,
33
- then the task name will be the basename of the path, which is `your_dataset_name`.
34
- data_adapter: DataAdapter, the data adapter for the dataset.
35
- model_adapter: BaseModelAdapter, the model adapter for the model.
36
- outputs: OutputsStructure, the outputs dir. Default: None
37
- task_cfg: TaskConfig, the overall task config. Default: None
38
- **kwargs: kwargs.
44
+ benchmark: The data adapter for loading and processing data.
45
+ model: The model to be evaluated.
46
+ outputs: The output structure for saving evaluation results.
47
+ task_config: The task configuration.
39
48
  """
40
49
 
41
- def __init__(self,
42
- data_adapter: DataAdapter,
43
- model_adapter: 'BaseModelAdapter',
44
- outputs: OutputsStructure = None,
45
- task_cfg: TaskConfig = None,
46
- **kwargs):
47
-
48
- self.dataset_name = data_adapter.name
49
- self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
50
- self.model_name = task_cfg.model_id
51
-
52
- self.data_adapter = data_adapter
53
- self.model_adapter = model_adapter
54
- self.model_cfg = model_adapter.model_cfg
55
- self.eval_type = task_cfg.eval_type
56
- self.dataset_hub = task_cfg.dataset_hub
57
- self.stage = task_cfg.stage
58
- self.use_cache = task_cfg.use_cache
59
- self.task_cfg = task_cfg
60
- # Deal with the output paths
61
- self.outputs_structure = outputs
62
- self.kwargs = kwargs
63
-
64
- self._init_judge()
65
-
66
- def _init_judge(self):
67
- if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
68
- self.judge = None
69
- else:
70
- from evalscope.metrics import LLMJudge
71
- self.judge = LLMJudge(**self.task_cfg.judge_model_args)
72
-
73
- def load_dataset(self):
74
- dataset = self.data_adapter.load(
75
- work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
76
-
77
- # Get prompts from dataset
78
- prompts = self.data_adapter.gen_prompts(data_dict=dataset)
79
-
80
- # Limit and index prompts
81
- limited_prompts = defaultdict(list)
82
- for subset_name, prompts_list in prompts.items():
83
- # If limit is None, use all prompts
84
- if self.task_cfg.limit is None:
85
- limit = len(prompts_list)
86
- else:
87
- if isinstance(self.task_cfg.limit, int):
88
- limit = self.task_cfg.limit
89
- elif isinstance(self.task_cfg.limit, float):
90
- limit = int(len(prompts_list) * self.task_cfg.limit)
91
- # Limit the number of prompts
92
- for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
93
- prompt[AnswerKeys.INDEX] = index
94
- limited_prompts[subset_name].append(prompt)
95
-
96
- return limited_prompts
97
-
98
- def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
99
- model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
100
- input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
101
- infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
102
- return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
103
-
104
- def _process_answer(self, answer_d, input_d, subset_name, answer_id):
105
- answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
106
- answer_d[AnswerKeys.ANSWER_ID] = answer_id
107
- answer_d[AnswerKeys.SUBSET_NAME] = subset_name
108
- answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
109
- answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
110
- return answer_d
111
-
112
- def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
113
- try:
114
- # get answer from model
115
- answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
116
- except Exception as e:
117
- logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
118
- # if ignore_errors is True, continue to next input
119
- if self.task_cfg.ignore_errors:
120
- logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
121
- return []
122
- else:
123
- raise e
124
- # process answer
125
- answers_list = []
126
- for answer_d, input_prompt in zip(answer_ds, input_prompts):
127
- answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
128
- processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
129
- answers_list.append(processed_answer)
130
- return answers_list
131
-
132
- @staticmethod
133
- def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
134
- # Filter prompts that have been answered
135
- answers_list = []
136
- if not use_cache or not os.path.exists(pred_file_path):
137
- return answers_list, prompts_list
138
-
139
- def get_answered_indices(answers_list: List[Dict]) -> List[int]:
140
- indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
141
-
142
- if all(index is None for index in indices):
143
- return list(range(len(answers_list)))
144
-
145
- return [index for index in indices if index is not None]
146
-
147
- answers_list = jsonl_to_list(pred_file_path)
148
- answered_indices = set(get_answered_indices(answers_list))
149
- logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
150
-
151
- prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
152
- return answers_list, prompts
153
-
154
- def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
50
+ def __init__(
51
+ self,
52
+ benchmark: 'DataAdapter',
53
+ model: 'Model',
54
+ outputs: 'OutputsStructure',
55
+ task_config: 'TaskConfig',
56
+ ):
57
+ # Store core components needed for evaluation
58
+ self.benchmark = benchmark
59
+ self.model = model
60
+ self.outputs = outputs
61
+ self.task_config = task_config
62
+
63
+ # Extract frequently used identifiers
64
+ self.benchmark_name = benchmark.name
65
+ """Name of the benchmark being evaluated."""
66
+
67
+ self.model_name = task_config.model_id
68
+ """ID of the model being evaluated."""
69
+
70
+ self.use_cache = task_config.use_cache
71
+ """Whether to use cache for predictions."""
72
+
73
+ # Initialize cache manager for storing and retrieving cached results
74
+ self.cache_manager = CacheManager(
75
+ outputs=outputs,
76
+ model_name=self.model_name,
77
+ benchmark_name=self.benchmark_name,
78
+ )
79
+
80
+ def eval(self) -> Report:
155
81
  """
156
- Get answers from model inference.
157
- It is required to rewrite this method to support your own evaluator.
82
+ Run the complete evaluation process.
158
83
 
159
- Args:
160
- subset_name: subset name for benchmark.
161
- prompts_list: prompts list.
162
- infer_cfg: model inference config.
163
- Attributes:
164
- do_sample: bool, whether to use sampling.
165
- top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
166
- top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
167
- temperature: float, the value used to module the next token probabilities.
168
- num_beams: int, number of beams for beam search. 1 means no beam search.
169
- max_length: int, the max length of the sequence to be generated.
170
- max_new_tokens: int, the max number of new tokens to be generated.
171
- repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
172
- **kwargs: kwargs.
173
-
174
- Returns: The list of answers.
84
+ This is the main entry point that orchestrates the entire evaluation:
85
+ 1. Load dataset from benchmark
86
+ 2. Evaluate each subset independently
87
+ 3. Aggregate scores across subsets
88
+ 4. Generate final evaluation report
89
+
90
+ Returns:
91
+ Report: The complete evaluation report containing all metrics and results.
175
92
  """
176
- assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
177
- assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
178
- assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
179
-
180
- pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
181
- pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
182
- os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
183
-
184
- answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
185
-
186
- eval_batch_size = self.task_cfg.eval_batch_size
187
- if self.task_cfg.eval_type == EvalType.SERVICE:
188
- with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
189
- with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
190
- futures = []
191
- for input_prompt in prompts_list:
192
- futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
193
- for future in as_completed(futures):
194
- answer_ds: List[dict] = future.result()
195
- answers_list.extend(answer_ds)
196
- dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
197
- pbar.update(len(answer_ds))
198
- else:
199
- batch_prompts_list = [
200
- prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
201
- ]
202
- with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
203
- for batch_prompts in batch_prompts_list:
204
- answer_ds: List[dict] = self._get_answer(
205
- input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
206
- answers_list.extend(answer_ds)
207
- dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
208
- pbar.update(len(batch_prompts))
209
-
210
- logger.info(f'Dump predictions to {pred_file_path}.')
211
- return answers_list
212
-
213
- def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
214
-
215
- if reviewer_spec is None:
216
- reviewer_spec = {}
217
-
218
- review_res = deepcopy(answer_d)
219
- if AnswerKeys.CHOICES not in review_res:
220
- review_res[AnswerKeys.CHOICES] = []
221
- review_res[ReviewKeys.REVIEWED] = True
222
- review_res[ReviewKeys.REVIEW_ID] = None
223
- review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
224
- review_res[ReviewKeys.REVIEW_TIME] = time.time()
225
- logger.warning(f'No choices found for answer dict: {review_res}')
226
- return review_res
227
-
228
- rev_choices = []
229
- for choice in review_res[AnswerKeys.CHOICES]:
230
- raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
231
- answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
232
- gold_content = self.data_adapter.get_gold_answer(raw_input_d)
233
-
234
- # Get review result based on judge strategy
235
- use_llm = (
236
- self.task_cfg.judge_strategy == JudgeStrategy.LLM
237
- or (self.task_cfg.judge_strategy == JudgeStrategy.AUTO and self.data_adapter.llm_as_a_judge))
238
-
239
- if use_llm:
240
- # Use LLM as judge
241
- assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
242
- pred_content = self.data_adapter.llm_parse_pred_result(
243
- result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
244
- review_result = self.data_adapter.llm_match(
245
- gold_content, pred_content, self.judge, raw_input=raw_input_d)
246
- else:
247
- # Use rule-based judging
248
- pred_content = self.data_adapter.parse_pred_result(
249
- result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
250
- review_result = self.data_adapter.match(gold_content, pred_content)
251
-
252
- # For LLM_RECALL strategy, use LLM to re-judge if rule-based result is not good
253
- if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
254
- and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
255
- assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}' # noqa: E501
256
- pred_content = self.data_adapter.llm_parse_pred_result(
257
- result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
258
- review_result = self.data_adapter.llm_match(
259
- gold_content, pred_content, self.judge, raw_input=raw_input_d)
260
-
261
- choice[ReviewKeys.REVIEW] = {
262
- ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
263
- ReviewKeys.PRED: pred_content,
264
- ReviewKeys.RESULT: review_result
265
- }
266
- rev_choices.append(choice)
267
-
268
- review_res[AnswerKeys.CHOICES] = rev_choices
269
- review_res[ReviewKeys.REVIEWED] = True
270
- review_res[ReviewKeys.REVIEW_ID] = review_id
271
- review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
272
- review_res[ReviewKeys.REVIEW_TIME] = time.time()
273
-
274
- return review_res
275
-
276
- def _generate_review_id(self, answer_d):
277
- # Gen review_id (concat: answer_id + reviewer_spec)
278
- answer_id = answer_d[AnswerKeys.ANSWER_ID]
279
- reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
280
- reviewer_spec_str = json.dumps(
281
- OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
282
- review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
283
- return review_id, reviewer_spec
284
-
285
- def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
93
+ # Load the dataset and evaluate each subset
94
+ dataset_dict = self.benchmark.load_dataset()
95
+ agg_score_dict = defaultdict(list)
96
+
97
+ # Process each subset (e.g., test, validation) independently
98
+ for subset, dataset in dataset_dict.items():
99
+ assert len(dataset) > 0, f'No samples found in subset: {subset}'
100
+ subset_score = self.evaluate_subset(subset, dataset)
101
+ agg_score_dict[subset] = subset_score
102
+
103
+ # Generate the report based on aggregated scores
104
+ report = self.get_report(agg_score_dict)
105
+ return report
106
+
107
+ def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
286
108
  """
287
- Get reviews from answers.
288
- It is required to rewrite this method to support your own evaluator.
109
+ Evaluate a single subset of the dataset.
110
+
111
+ This method processes one subset through the complete evaluation pipeline:
112
+ 1. Get model predictions for all samples
113
+ 2. Calculate evaluation metrics for predictions
114
+ 3. Aggregate individual sample scores
289
115
 
290
116
  Args:
291
- subset_name: subset name of benchmark
292
- answers_list: inference results list.
293
- **kwargs: kwargs.
117
+ subset: Name of the subset being evaluated (e.g., 'test', 'validation').
118
+ dataset: The dataset subset containing samples to evaluate.
294
119
 
295
- Returns: reviews list.
120
+ Returns:
121
+ List[AggScore]: Aggregated scores for this subset.
296
122
  """
297
- reviews_list = []
298
-
299
- review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
300
- review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
301
- os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
302
-
303
- # Load existing reviews if using cache
304
- existing_reviews = {}
305
- if self.use_cache and os.path.exists(review_file_path):
306
- with open(review_file_path, 'r') as f:
307
- for line in f:
308
- review = json.loads(line.strip())
309
- existing_reviews[review['index']] = review
310
- logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
311
-
312
- def process_single_review(answer_d):
313
- # Check if review already exists in cache
314
- if self.use_cache and answer_d['index'] in existing_reviews:
315
- return existing_reviews[answer_d['index']]
316
-
317
- review_id, reviewer_spec = self._generate_review_id(answer_d)
318
- # Get review
319
- review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
320
- logger.debug(review_d)
321
- return review_d
322
-
323
- with ThreadPoolExecutor(max_workers=self.task_cfg.judge_worker_num) as executor:
324
- # Submit all tasks and get futures
325
- futures = [executor.submit(process_single_review, answer_d) for answer_d in answers_list]
326
-
327
- # Process completed futures with progress bar
328
- for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
329
- review_d = future.result()
330
- reviews_list.append(review_d)
331
- # Dump new reviews only if not using cache or review is new
332
- if not self.use_cache or review_d['index'] not in existing_reviews:
333
- dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
334
-
335
- return reviews_list
336
-
337
- def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
123
+ # Get model predictions for all samples in the subset
124
+ task_states = self.get_answers(subset, dataset)
125
+
126
+ # Calculate evaluation metrics for each prediction
127
+ sample_scores = self.get_reviews(subset, task_states)
128
+
129
+ # Aggregate individual sample scores into subset-level metrics
130
+ agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
131
+ return agg_scores
132
+
133
+ def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
338
134
  """
339
- To compute metrics from reviews_list for each subset.
340
- It is required to rewrite this method to support your own evaluator.
135
+ Get model predictions for all samples in the dataset subset.
136
+
137
+ This method handles:
138
+ 1. Loading cached predictions if available and caching is enabled
139
+ 2. Running model inference on remaining samples in parallel
140
+ 3. Saving new predictions to cache
341
141
 
342
142
  Args:
343
- reviews_list: reviews list.
143
+ subset: Name of the subset being processed.
144
+ dataset: The dataset subset containing samples for prediction.
344
145
 
345
146
  Returns:
346
- The metric result. Depends on the metric function in data_adapter.
147
+ List[TaskState]: Task states containing model predictions for each sample.
347
148
  """
348
- # Get max choices
349
- choices_lengths = [
350
- len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
351
- ]
352
- if choices_lengths:
353
- max_choices = max(choices_lengths)
149
+ # Initialize task state list and filter cached predictions if caching is enabled
150
+ if self.use_cache:
151
+ task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
354
152
  else:
355
- max_choices = 0
356
-
357
- # Get review result
358
- review_res_list = []
359
- for review_d in reviews_list:
360
- if not review_d[ReviewKeys.REVIEWED]:
361
- logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
362
- continue
363
-
364
- if len(review_d[AnswerKeys.CHOICES]) == 0:
365
- logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
366
- continue
367
- elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
368
- review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
369
- else:
370
- review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
371
- if len(review_d[AnswerKeys.CHOICES]) < max_choices:
372
- logger.warning(
373
- f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
374
- f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
375
-
376
- review_res_list.append(review_res)
377
-
378
- metric_score: List[dict] = self.data_adapter.compute_metric(
379
- review_res_list=review_res_list, reviews_list=reviews_list)
380
-
381
- return metric_score
382
-
383
- def dump_report(self, reviews_score_all: List[dict]):
384
- """
385
- Get report for total reviews of specific dataset.
386
- It is required to rewrite this method to support your own evaluator.
153
+ task_state_list = []
387
154
 
388
- Args:
389
- reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
155
+ # Get output directory for storing model predictions
156
+ model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
390
157
 
391
- Returns: None
392
- """
393
- report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
394
- os.makedirs(report_path, exist_ok=True)
395
- # Get report map
396
- report_map: Report = self.data_adapter.gen_report(
397
- subset_score_map=reviews_score_all, model_name=self.model_name)
158
+ # Convert dataset to list for parallel processing
159
+ dataset_list = list(dataset)
398
160
 
399
- # Make table
400
- try:
401
- report_table = gen_table(report_list=[report_map], add_overall_metric=True)
402
- logger.info(f'\n{self.dataset_name_or_path} report table:'
403
- f'\n{report_table} \n')
404
- except Exception:
405
- logger.error('Failed to generate report table.')
161
+ if not dataset_list:
162
+ return task_state_list
406
163
 
407
- # Make report analysis
408
- if self.task_cfg.analysis_report:
409
- logger.info('Generating report analysis, please wait ...')
410
- analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
411
- logger.info('Report analysis:\n%s', analysis)
412
- else:
413
- logger.info('Skipping report analysis (`analysis_report=False`).')
164
+ # Process samples in parallel using ThreadPoolExecutor
165
+ with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
166
+ # Submit all prediction tasks
167
+ future_to_sample = {
168
+ executor.submit(self._predict_sample, sample, model_prediction_dir): sample
169
+ for sample in dataset_list
170
+ }
414
171
 
415
- # Dump report
416
- report_file = os.path.join(report_path, f'{self.dataset_name}.json')
417
- report_map.to_json(report_file)
418
- logger.info(f'Dump report to: {report_file} \n')
172
+ # Process completed tasks with progress bar
173
+ with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
174
+ for future in as_completed(future_to_sample):
175
+ sample = future_to_sample[future]
176
+ try:
177
+ task_state = future.result()
178
+ task_state_list.append(task_state)
179
+
180
+ # Save the prediction result to cache for future use
181
+ model_result = self.cache_manager.save_prediction_cache(
182
+ subset, task_state, self.benchmark.save_metadata
183
+ )
184
+ logger.debug(f'Model result: \n{model_result.model_dump_json(indent=2)}')
185
+
186
+ except Exception as exc:
187
+ logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
188
+ if self.task_config.ignore_errors:
189
+ logger.warning('Error ignored, continuing with next sample.')
190
+ else:
191
+ raise exc
192
+ finally:
193
+ pbar.update(1)
194
+
195
+ return task_state_list
196
+
197
+ def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
198
+ """
199
+ Helper method to predict a single sample.
419
200
 
420
- # Post process report
421
- try:
422
- self.data_adapter.post_process_report(report_map, report_path=report_path)
423
- except Exception as e:
424
- logger.error(f'Failed to post process report: {e}')
201
+ Args:
202
+ sample: The sample to predict.
203
+ model_prediction_dir: Directory for storing model predictions.
425
204
 
426
- return report_map
205
+ Returns:
206
+ TaskState: The task state containing the prediction result.
207
+ """
208
+ logger.debug(f'\n{sample.pretty_print()}')
427
209
 
428
- def eval(self, **kwargs) -> dict:
210
+ # Run model inference on the current sample
211
+ task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
212
+ return task_state
213
+
214
+ def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
429
215
  """
430
- Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
431
- It is required to rewrite this method to support your own evaluator.
216
+ Calculate evaluation metrics for model predictions.
432
217
 
433
- The evaluation process is as follows:
434
- 1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
435
- 2. Get the input prompts from dataset with specific data adapter.
436
- 3. Get answers with model inference.
437
- 4. Get reviews with metric function (or reviewers).
438
- 5. Generate report from review results.
218
+ This method handles:
219
+ 1. Loading cached review results if available and caching is enabled
220
+ 2. Computing metrics for remaining task states in parallel
221
+ 3. Saving new review results to cache
439
222
 
440
223
  Args:
441
- infer_cfg: The config for model inference.
224
+ subset: Name of the subset being reviewed.
225
+ task_states: List of task states containing model predictions.
442
226
 
443
227
  Returns:
444
- Dict of results. Depends on the stage of evaluation.
228
+ List[SampleScore]: Evaluation scores for each sample.
229
+ """
230
+ # Initialize sample score list and filter cached reviews if caching is enabled
231
+ if self.use_cache and not self.task_config.rerun_review:
232
+ sample_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
233
+ else:
234
+ # Init a clean sample score list
235
+ sample_score_list = []
236
+ self.cache_manager.delete_review_cache(subset)
237
+
238
+ if not task_states:
239
+ return sample_score_list
240
+
241
+ # Process task states in parallel using ThreadPoolExecutor
242
+ with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
243
+ # Submit all review tasks
244
+ future_to_task_state = {
245
+ executor.submit(self._review_task_state, task_state): task_state
246
+ for task_state in task_states
247
+ }
445
248
 
446
- stage == 'all': return the report_map
447
- stage == 'infer': return the answers_map
448
- stage == 'review': return the reviews_map
249
+ # Process completed tasks with progress bar
250
+ with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
251
+ for future in as_completed(future_to_task_state):
252
+ task_state = future_to_task_state[future]
253
+ try:
254
+ sample_score = future.result()
255
+ sample_score_list.append(sample_score)
256
+
257
+ # Save the review result to cache for future use
258
+ review_result = self.cache_manager.save_review_cache(
259
+ subset=subset,
260
+ task_state=task_state,
261
+ sample_score=sample_score,
262
+ save_metadata=self.benchmark.save_metadata
263
+ )
264
+ logger.debug(f'Review result: \n{review_result.model_dump_json(indent=2)}')
265
+
266
+ except Exception as exc:
267
+ logger.error(f'Error when review sample {task_state.sample_id}: {exc}')
268
+ if self.task_config.ignore_errors:
269
+ logger.warning('Error ignored, continuing with next sample.')
270
+ else:
271
+ raise exc
272
+ finally:
273
+ pbar.update(1)
274
+
275
+ return sample_score_list
276
+
277
+ def _review_task_state(self, task_state: TaskState) -> SampleScore:
449
278
  """
279
+ Helper method to review a single task state.
450
280
 
451
- logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
281
+ Args:
282
+ task_state: The task state to review.
452
283
 
453
- reviews_score_all = {} # {subset_name: (score, num)}
454
- stage_answers_dict = {}
455
- stage_reviews_dict = {}
284
+ Returns:
285
+ SampleScore: The evaluation score for the task state.
286
+ """
287
+ # Compute evaluation metrics using the benchmark's metric calculation
288
+ sample_score = self.benchmark.calculate_metrics(task_state=task_state)
289
+ return sample_score
456
290
 
457
- prompts = self.load_dataset()
458
- for subset_name, prompts_list in prompts.items():
291
+ def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
292
+ """
293
+ Generate a comprehensive evaluation report from aggregated scores.
459
294
 
460
- answers_list: list = self.get_answers(
461
- subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
462
- if self.stage == EvalStage.INFER:
463
- stage_answers_dict[subset_name] = answers_list
464
- continue
295
+ This method handles:
296
+ 1. Creating the evaluation report from scores
297
+ 2. Generating and displaying a summary table
298
+ 3. Optionally generating detailed analysis
299
+ 4. Saving the report to file
465
300
 
466
- reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
301
+ Args:
302
+ agg_score_dict: Dictionary mapping subset names to their aggregated scores.
467
303
 
468
- metric_res = self.compute_metrics(reviews_list=reviews_list)
469
- reviews_score_all[subset_name] = metric_res
470
- stage_reviews_dict[subset_name] = reviews_list
304
+ Returns:
305
+ Report: The complete evaluation report.
306
+ """
307
+ assert agg_score_dict, 'No scores to generate report from.'
471
308
 
472
- if self.stage == EvalStage.INFER:
473
- return stage_answers_dict
309
+ # Get paths for saving the report
310
+ report_path = self.cache_manager.get_report_path()
311
+ report_file = self.cache_manager.get_report_file()
474
312
 
475
- if self.stage == EvalStage.REVIEW:
476
- return stage_reviews_dict
313
+ # Generate the main evaluation report using benchmark-specific logic
314
+ report = self.benchmark.generate_report(
315
+ scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
316
+ )
477
317
 
478
- # Generate report
479
- report_map = self.dump_report(reviews_score_all)
318
+ # Generate and display a summary table of results
319
+ try:
320
+ report_table = gen_table(report_list=[report], add_overall_metric=True)
321
+ logger.info(f'\n{self.benchmark_name} report table:'
322
+ f'\n{report_table} \n')
323
+ except Exception:
324
+ logger.error('Failed to generate report table.')
480
325
 
481
- logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
326
+ # Generate detailed analysis if requested in configuration
327
+ if self.task_config.analysis_report:
328
+ logger.info('Generating report analysis, please wait ...')
329
+ analysis = report.generate_analysis(self.task_config.judge_model_args)
330
+ logger.info(f'Report analysis:\n{analysis}')
331
+ else:
332
+ logger.info('Skipping report analysis (`analysis_report=False`).')
482
333
 
483
- return report_map
334
+ # Save the complete report to file
335
+ report.to_json(report_file)
336
+ logger.info(f'Dump report to: {report_file} \n')
337
+ return report