evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/evaluator/cache.py
@@ -0,0 +1,378 @@
+ import copy
+ import os
+ from pydantic import BaseModel
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from evalscope.api.dataset import Dataset
+ from evalscope.api.messages import ChatMessage
+ from evalscope.api.metric import SampleScore
+ from evalscope.api.model import ModelOutput
+ from evalscope.constants import DumpMode
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .state import TaskState
+
+ logger = get_logger()
+
+
+ class CacheManager:
+     """
+     Manage model results and review results for evaluation caching.
+
+     This class handles the caching mechanism for evaluation results, allowing
+     the system to resume evaluations from previously computed results and
+     avoid redundant computations.
+     """
+
+     def __init__(self, outputs: OutputsStructure, model_name: str, benchmark_name: str):
+         """
+         Initialize the cache manager.
+
+         Args:
+             outputs: Output directory structure for storing cache files
+             model_name: Name of the model being evaluated
+             benchmark_name: Name of the benchmark being used
+         """
+         self.outputs = outputs
+         self.model_name = model_name
+         self.benchmark_name = benchmark_name
+
+     def filter_prediction_cache(self, subset: str, dataset: Dataset) -> Tuple[List[TaskState], Dataset]:
+         """
+         Load cached prediction results and filter them from the dataset.
+
+         This method checks for existing prediction cache files and loads any
+         previously computed results. It then filters these samples from the
+         input dataset to avoid recomputation.
+
+         Args:
+             subset: Name of the dataset subset
+             dataset: The dataset to filter
+
+         Returns:
+             Tuple of (cached task states, filtered dataset with remaining samples)
+         """
+         cache_file = self.get_prediction_cache_path(subset)
+         if not os.path.exists(cache_file):
+             # No cache file exists, return empty cache and full dataset
+             return [], dataset
+
+         cached_task_states = []
+         cached_sample_ids = set()
+         cache_items = jsonl_to_list(cache_file)
+
+         # Process each cached item
+         for cache_item in cache_items:
+             # Deserialize the cached model result
+             cached_model_result = ModelResult.model_validate(cache_item)
+             # Convert to task state for further processing
+             cached_state = cached_model_result.to_task_state(dataset=dataset)
+
+             cached_task_states.append(cached_state)
+             cached_sample_ids.add(cached_state.sample_id)
+
+         # Remove cached samples from the dataset to avoid reprocessing
+         filtered_dataset = dataset.filter(lambda sample: sample.id not in cached_sample_ids)
+
+         logger.info(
+             f'Reusing predictions from {cache_file}, got {len(cached_task_states)} predictions, '
+             f'remaining {len(filtered_dataset)} samples'
+         )
+         return cached_task_states, filtered_dataset
+
+     def get_prediction_cache_path(self, subset: str) -> str:
+         """
+         Get the file path for prediction cache storage.
+
+         Args:
+             subset: Name of the dataset subset
+
+         Returns:
+             Path to the prediction cache file
+         """
+         file_path = os.path.join(self.outputs.predictions_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         return file_path
+
+     def save_prediction_cache(self, subset: str, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
+         """
+         Save a prediction result to the cache.
+
+         Args:
+             subset: Name of the dataset subset
+             task_state: The task state containing prediction results
+
+         Returns:
+             The saved model result object
+         """
+         cache_file = self.get_prediction_cache_path(subset)
+         # Convert task state to serializable model result
+         model_result = ModelResult.from_task_state(task_state, save_metadata)
+         # Serialize to dictionary
+         model_result_dict = model_result.model_dump()
+         # Append to JSONL cache file
+         dump_jsonl_data(data_list=model_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
+         return model_result
+
+     def filter_review_cache(self, subset: str,
+                             task_states: List[TaskState]) -> Tuple[List[SampleScore], List[TaskState]]:
+         """
+         Load cached review results and filter corresponding task states.
+
+         This method loads previously computed review scores and removes
+         the corresponding task states from further review processing.
+
+         Args:
+             subset: Name of the dataset subset
+             task_states: List of task states to potentially review
+
+         Returns:
+             Tuple of (cached sample scores, filtered task states for remaining reviews)
+         """
+         cache_file = self.get_review_cache_path(subset)
+         if not os.path.exists(cache_file):
+             # No review cache exists, return empty scores and all task states
+             return [], task_states
+
+         cached_sample_scores: List[SampleScore] = []
+         cache_items = jsonl_to_list(cache_file)
+
+         # Process each cached review result
+         for cache_item in cache_items:
+             # Deserialize the cached review result
+             cached_review_result = ReviewResult.model_validate(cache_item)
+             cached_sample_scores.append(cached_review_result.to_sample_score())
+
+         # Filter out task states that already have review scores
+         cached_sample_ids = {review.sample_id for review in cached_sample_scores}
+         filtered_task_states = [state for state in task_states if state.sample_id not in cached_sample_ids]
+
+         logger.info(f'Reusing reviews from {cache_file}, got {len(cached_sample_scores)} reviews')
+         return cached_sample_scores, filtered_task_states
+
+     def get_review_cache_path(self, subset: str) -> str:
+         """
+         Get the file path for review cache storage.
+
+         Args:
+             subset: Name of the dataset subset
+
+         Returns:
+             Path to the review cache file
+         """
+         file_path = os.path.join(self.outputs.reviews_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         return file_path
+
+     def delete_review_cache(self, subset: str):
+         """Delete the review cache for a specific subset. If the cache exists, it will be removed."""
+         file_path = self.get_review_cache_path(subset)
+         if os.path.exists(file_path):
+             logger.info(f'Deleting review cache file: {file_path}')
+             os.remove(file_path)
+
+     def save_review_cache(
+         self,
+         subset: str,
+         task_state: TaskState,
+         sample_score: SampleScore,
+         save_metadata: bool = True
+     ) -> 'ReviewResult':
+         """
+         Save a review result to the cache.
+
+         Args:
+             subset: Name of the dataset subset
+             task_state: The task state that was reviewed
+             sample_score: The computed score for the sample
+
+         Returns:
+             The saved review result object
+         """
+         cache_file = self.get_review_cache_path(subset)
+         # Convert score and state to serializable review result
+         review_result = ReviewResult.from_score_state(sample_score, task_state, save_metadata)
+         # Serialize to dictionary
+         review_result_dict = review_result.model_dump()
+         # Append to JSONL cache file
+         dump_jsonl_data(data_list=review_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
+         return review_result
+
+     def get_report_path(self) -> str:
+         """
+         Get the directory path for report storage.
+
+         Returns:
+             Path to the reports directory for this model
+         """
+         report_path = os.path.join(self.outputs.reports_dir, self.model_name)
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(report_path, exist_ok=True)
+         return report_path
+
+     def get_report_file(self) -> str:
+         """
+         Get the report file path for the benchmark.
+
+         The report file is named as '{benchmark_name}.json' and contains
+         the final evaluation results for the benchmark.
+
+         Returns:
+             Full path to the benchmark report file
+         """
+         return os.path.join(self.get_report_path(), f'{self.benchmark_name}.json')
+
+
+ class ModelResult(BaseModel):
+     """
+     Serializable container for model prediction results.
+
+     This class represents a single model prediction that can be cached
+     and restored later to avoid recomputation.
+     """
+
+     index: int
+     """Index of the sample in the dataset that was processed."""
+
+     model: str = ''
+     """Name of the model that generated this prediction."""
+
+     model_output: Optional[ModelOutput] = None
+     """The actual prediction/output generated by the model."""
+
+     messages: List[ChatMessage] = []
+     """Chat messages exchanged during evaluation (for conversational models)."""
+
+     metadata: Optional[Dict[str, Any]] = None
+     """Additional metadata associated with the model result."""
+
+     @classmethod
+     def from_task_state(cls, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
+         """
+         Create a ModelResult from a TaskState for caching.
+
+         Args:
+             task_state: The completed task state to serialize
+
+         Returns:
+             ModelResult object ready for caching
+         """
+         return cls(
+             model=task_state.model,
+             index=task_state.sample_id,
+             messages=task_state.messages,
+             model_output=task_state.output,
+             metadata=task_state.metadata if save_metadata else {},
+         )
+
+     def to_task_state(self, dataset: Dataset) -> TaskState:
+         """
+         Restore a TaskState from cached ModelResult.
+
+         Args:
+             dataset: The dataset to retrieve the original sample from
+
+         Returns:
+             Reconstructed TaskState with cached results
+
+         Raises:
+             ValueError: If the sample index is not found in the dataset
+         """
+         sample = dataset[self.index]
+         if not sample:
+             raise ValueError(f'Sample with index {self.index} not found in dataset')
+
+         # update metadata if exists
+         if self.metadata:
+             sample.metadata.update(self.metadata)
+
+         return TaskState(
+             model=self.model,
+             sample=sample,
+             messages=self.messages,
+             output=ModelOutput.model_validate(self.model_output),
+             completed=True, # Mark as completed since it was cached
+         )
+
+     def pretty_print(self) -> str:
+         """
+         Generate a pretty-printed string representation of the model result.
+
+         Returns:
+             A string representation of the model result
+         """
+         return self.model_dump_json(indent=2)
+
+
+ class ReviewResult(BaseModel):
+     """
+     Serializable container for review/scoring results.
+
+     This class represents the result of reviewing a model's prediction,
+     including the computed score and relevant context.
+     """
+
+     index: int
+     """Index of the sample that was reviewed."""
+
+     input: str = ''
+     """Original input from the sample (immutable reference)."""
+
+     target: Optional[str] = None
+     """Expected/target answer for the sample, if available."""
+
+     sample_score: SampleScore
+     """The computed evaluation score for this sample."""
+
+     @classmethod
+     def from_score_state(
+         cls, sample_score: SampleScore, state: TaskState, save_metadata: bool = True
+     ) -> 'ReviewResult':
+         """
+         Create a ReviewResult from a score and task state for caching.
+
+         Args:
+             sample_score: The computed score for the sample
+             state: The task state containing sample information
+
+         Returns:
+             ReviewResult object ready for caching
+         """
+         if not save_metadata:
+             sample_score = copy.deepcopy(sample_score)
+             sample_score.sample_metadata = None
+
+         return cls(
+             index=state.sample_id,
+             input=state.input_markdown,
+             target=state.target,
+             sample_score=sample_score,
+         )
+
+     def to_sample_score(self) -> SampleScore:
+         """
+         Extract the sample score from the cached review result.
+
+         Returns:
+             The sample score object
+         """
+         return self.sample_score
+
+     def pretty_print(self) -> str:
+         """
+         Generate a pretty-printed string representation of the review result.
+
+         Returns:
+             A string representation of the review result
+         """
+         output = [
+             f'Review Result for Sample {self.index}:',
+             f'Target: {self.target}',
+             f'Score: {self.sample_score.model_dump_json(indent=2)}',
+         ]
+         return '\n'.join(output)
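
The cache above is keyed by model, benchmark, and subset, and each filter_*/save_* pair lets an evaluation resume where it left off. The sketch below shows how a driver loop might use CacheManager; the OutputsStructure arguments and the run_model/score_state helpers (and the pre-loaded dataset) are illustrative placeholders, not part of the released API.

from evalscope.api.evaluator.cache import CacheManager
from evalscope.utils.io_utils import OutputsStructure

# Illustrative wiring; the real OutputsStructure constructor may take different arguments.
outputs = OutputsStructure(outputs_dir='./outputs')
cache = CacheManager(outputs, model_name='my-model', benchmark_name='gsm8k')

# Reuse cached predictions, then run the model only on the remaining samples.
states, remaining = cache.filter_prediction_cache('main', dataset)  # `dataset` assumed loaded elsewhere
for sample in remaining:
    state = run_model(sample)  # hypothetical helper returning a TaskState
    cache.save_prediction_cache('main', state)
    states.append(state)

# Reuse cached reviews, then score and cache whatever is left.
scores, to_review = cache.filter_review_cache('main', states)
for state in to_review:
    score = score_state(state)  # hypothetical helper returning a SampleScore
    cache.save_review_cache('main', state, score)
    scores.append(score)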
@@ -0,0 +1,56 @@
1
+ import abc
2
+ from typing import TYPE_CHECKING, List, Union
3
+
4
+ from evalscope.api.metric import SampleScore
5
+ from evalscope.report import Report
6
+ from .state import TaskState
7
+
8
+ if TYPE_CHECKING:
9
+ from evalscope.api.benchmark import DataAdapter
10
+ from evalscope.api.model import Model
11
+ from evalscope.config import TaskConfig
12
+ from evalscope.utils.io_utils import OutputsStructure
13
+
14
+
15
+ class Evaluator(abc.ABC):
16
+ """
17
+ Abstract base class for evaluators.
18
+
19
+ Args:
20
+ benchmark (DataAdapter): The data adapter for the benchmark.
21
+ model (Model): The model to evaluate.
22
+ outputs (OutputsStructure, optional): The output structure for results.
23
+ task_config (TaskConfig, optional): The task configuration.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ benchmark: 'DataAdapter',
29
+ model: 'Model',
30
+ outputs: 'OutputsStructure' = None,
31
+ task_config: 'TaskConfig' = None,
32
+ ):
33
+ self.benchmark = benchmark
34
+ self.model = model
35
+ self.outputs = outputs
36
+ self.task_config = task_config
37
+
38
+ @abc.abstractmethod
39
+ def eval(self, *args, **kwargs) -> Report:
40
+ """Run the evaluation process."""
41
+ pass
42
+
43
+ @abc.abstractmethod
44
+ def get_answers(self, *args, **kwargs) -> List[TaskState]:
45
+ """Get the evaluation answers."""
46
+ pass
47
+
48
+ @abc.abstractmethod
49
+ def get_reviews(self, *args, **kwargs) -> List[SampleScore]:
50
+ """Get the review results."""
51
+ pass
52
+
53
+ @abc.abstractmethod
54
+ def get_report(self, *args, **kwargs) -> Report:
55
+ """Get the evaluation report."""
56
+ pass
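
The abstract interface above fixes the shape of an evaluator: produce TaskStates, score them into SampleScores, and aggregate into a Report. A minimal subclass skeleton is sketched below; the method bodies, signatures beyond the abstract ones, and the exact import paths are assumptions, not the evaluator shipped in 1.0.1.

from typing import List

from evalscope.api.evaluator.evaluator import Evaluator
from evalscope.api.evaluator.state import TaskState
from evalscope.api.metric import SampleScore
from evalscope.report import Report


class MyEvaluator(Evaluator):
    """Illustrative subclass; the concrete evaluator in the release may differ."""

    def eval(self) -> Report:
        states = self.get_answers()        # run self.model over self.benchmark's samples
        scores = self.get_reviews(states)  # score each TaskState
        return self.get_report(scores)     # aggregate into a Report

    def get_answers(self) -> List[TaskState]:
        raise NotImplementedError  # generate predictions with self.model

    def get_reviews(self, states: List[TaskState]) -> List[SampleScore]:
        raise NotImplementedError  # apply the benchmark's scorers

    def get_report(self, scores: List[SampleScore]) -> Report:
        raise NotImplementedError  # build the final Report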
evalscope/api/evaluator/state.py
@@ -0,0 +1,275 @@
+ from dataclasses import dataclass
+ from random import Random
+ from typing import Any, Dict, List, Optional, Sequence, Union, overload
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessage, ChatMessageUser, messages_pretty_str, messages_to_markdown
+ from evalscope.api.model import ModelOutput
+
+
+ class Target(Sequence[str]):
+     """Target for scoring against the current TaskState.
+
+     Target is a sequence of one or more strings. Use the
+     `text` property to access the value as a single string.
+     """
+
+     def __init__(self, target: Union[str, List[str]]) -> None:
+         self.target = target if isinstance(target, list) else [target]
+
+     @overload
+     def __getitem__(self, index: int) -> str:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[str]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[str, Sequence[str]]:
+         return self.target[index]
+
+     def __len__(self) -> int:
+         return len(self.target)
+
+     @property
+     def text(self) -> str:
+         return ''.join(self.target)
+
+
+ @dataclass
+ class Choice:
+     """
+     A `Choice` represents a single choice in a multiple choice question.
+
+     It is only relevant for the `multiple_choice` solver and corresponding
+     `choice` scorer.
+     """
+
+     value: str
+     """The original value of the choice from the `Sample`."""
+
+     correct: Optional[bool]
+     """Did the model think this choice satisfies the question? `None`
+     indicates this has not been set yet"""
+
+     original_position: int
+     """Choices may be re-ordered during processing, this represents the
+     original position in the sample's list of choices"""
+
+
+ class Choices(Sequence[Choice]):
+     """
+     Wrapper class for a list of `Choice` objects.
+
+     Primarily simply to abstract away implementations of choice-specific
+     functionality from the already-big `TaskState` class.
+     """
+
+     def __init__(self, choices: Union[List[str], List[Choice]]) -> None:
+         """
+         Setter for choices, intended to only be used with the `multiple_choice` scorer.
+
+         Choices come from a list of choices for the sample, specifically used by
+         the `multiple_choice` scorer.
+
+         For example, if the sample was a multiple choice question like "What is
+         the capital of France? A) Paris B) London C) Berlin", we would store the
+         possible answers here.
+         """
+         self._choices: List[Choice] = []
+
+         for i, choice in enumerate(choices):
+             if isinstance(choice, str):
+                 self._choices.append(Choice(value=choice, correct=None, original_position=i))
+             elif isinstance(choice, Choice):
+                 self._choices.append(choice)
+
+     @overload
+     def __getitem__(self, index: int) -> Choice:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[Choice]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[Choice, Sequence[Choice]]:
+         return self._choices[index]
+
+     def __len__(self) -> int:
+         return len(self._choices)
+
+     def mark_choice(self, index: int, correct: bool) -> None:
+         """Set the value of a specific choice"""
+         self._choices[index].correct = correct
+
+     def shuffle(self, rand: Random = Random()) -> None:
+         """
+         Shuffle the choice order, setting the `original_position` so they can be mapped back to their original order.
+
+         Some evals will shuffle the choices from the original sample to try to
+         avoid the model answering correctly due to fine-tuning (or similar) on
+         specific datasets.
+         """
+         shuffled_positions = list(range(len(self._choices)))
+         rand.shuffle(shuffled_positions)
+
+         shuffled_choices = [Choice('notachoice', None, -1)] * len(self._choices)
+
+         for i, shuffled_position in enumerate(shuffled_positions):
+             shuffled_choices[i] = self._choices[shuffled_position]
+             shuffled_choices[i].original_position = shuffled_position
+
+         self._choices = shuffled_choices
+
+
+ class TaskState:
+     """
+     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
+
+     The `TaskState` is passed to and returned from each solver during a sample's
+     evaluation. It allows us to maintain the manipulated message history, the tools
+     available to the model, the final output of the model, and whether the task
+     is completed or has hit a limit.
+     """
+
+     def __init__(
+         self,
+         model: str,
+         sample: Sample,
+         messages: List[ChatMessage] = [],
+         output: Optional[ModelOutput] = None,
+         completed: bool = False,
+     ) -> None:
+         self._model = model
+         self._sample = sample
+         self._sample_id = sample.id
+         self._group_id = sample.group_id
+         self._input = sample.input
+         self._target = Target(sample.target)
+         self._metadata = sample.metadata
+         self._messages: List[ChatMessage] = messages
+         self._output = output if output else ModelOutput(model=str(model))
+         self._completed = completed
+         if sample.choices:
+             self._choices = Choices(sample.choices)
+         else:
+             self._choices = Choices([])
+
+     @property
+     def model(self) -> str:
+         """Name of model being evaluated."""
+         return self._model
+
+     @property
+     def sample_id(self) -> int:
+         """Unique id for sample."""
+         return self._sample_id
+
+     @property
+     def group_id(self) -> int:
+         """Group id for sample."""
+         return self._group_id
+
+     @property
+     def input(self) -> Union[str, List[ChatMessage]]:
+         """Input from the `Sample`, should be considered immutable."""
+         return self._input
+
+     @property
+     def input_text(self) -> str:
+         """
+         Convenience function for accessing the initial input from the `Sample` as a string.
+
+         If the `input` is a `List[ChatMessage]`, this will return the text from
+         the last chat message
+         """
+         if isinstance(self._input, str):
+             return self._input
+         else:
+             return messages_pretty_str(self._input)
+
+     @property
+     def input_markdown(self) -> str:
+         """Get the input text as markdown.
+
+         For multi-modal content, images will be represented in markdown format.
+         """
+         if isinstance(self._input, str):
+             return self._input
+         else:
+             return messages_to_markdown(self._input)
+
+     @property
+     def choices(self) -> Choices:
+         """Choices for the sample, if applicable."""
+         return self._choices
+
+     @property
+     def user_prompt(self) -> ChatMessageUser:
+         """User prompt for this state.
+
+         Tasks are very general and can have many types of inputs.
+         However, in many cases solvers assume they can interact with
+         the state as a "chat" in a predictable fashion (e.g. prompt
+         engineering solvers). This property enables easy read and
+         write access to the user chat prompt. Raises an
+         exception if there is no user prompt
+         """
+         prompt = next((m for m in reversed(self.messages) if m.role == 'user'), None)
+         if prompt:
+             return prompt
+         else:
+             raise ValueError('user_prompt requested from TaskState but none available')
+
+     @property
+     def metadata(self) -> Dict[str, Any]:
+         """Metadata from the `Sample` for this `TaskState`"""
+         return self._metadata
+
+     @metadata.setter
+     def metadata(self, metadata: Dict[str, Any]) -> None:
+         self._metadata = metadata
+
+     @property
+     def messages(self) -> List[ChatMessage]:
+         """
+         Chat conversation history for sample.
+
+         This will generally get appended to every time a `generate` call is made
+         to the model. Useful for both debug and for solvers/scorers to assess
+         model performance or choose the next step.
+         """
+         return self._messages
+
+     @messages.setter
+     def messages(self, messages: List[ChatMessage]) -> None:
+         self._messages = messages
+
+     @property
+     def output(self) -> ModelOutput:
+         """
+         The 'final' model output once we've completed all solving.
+
+         For simple evals this may just be the last `message` from the
+         conversation history, but more complex solvers may set this directly.
+         """
+         return self._output
+
+     @output.setter
+     def output(self, output: ModelOutput) -> None:
+         self._output = output
+
+     @property
+     def completed(self) -> bool:
+         """Is the task completed."""
+         return self._completed
+
+     @completed.setter
+     def completed(self, completed: bool) -> None:
+         """Set the completed status."""
+         self._completed = completed
+
+     @property
+     def target(self) -> str:
+         """The scoring target for this `Sample`."""
+         return self._target.text
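
As a usage note, Choices.shuffle records each choice's original_position so a scorer can map shuffled answers back to the sample's order. The snippet below is a rough sketch; the Sample constructor fields are assumed from how TaskState reads them (input, target, choices, and defaults for id and metadata) and may not match the real signature in evalscope/api/dataset/dataset.py.

from random import Random

from evalscope.api.dataset import Sample
from evalscope.api.evaluator.state import TaskState

# Assumed Sample fields; defaults for id/group_id/metadata are presumed to exist.
sample = Sample(
    input='What is the capital of France?',
    choices=['Paris', 'London', 'Berlin'],
    target='A',
)

state = TaskState(model='my-model', sample=sample)
state.choices.shuffle(Random(42))           # deterministic shuffle for reproducibility
state.choices.mark_choice(0, correct=True)  # record a judgement against shuffled position 0

for choice in state.choices:
    # original_position maps each shuffled choice back to the sample's order.
    print(choice.value, choice.original_position, choice.correct)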