evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/benchmark/meta.py
@@ -0,0 +1,121 @@
+import copy
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
+
+from evalscope.constants import OutputType
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+
+
+@dataclass
+class BenchmarkMeta:
+    """Metadata for a benchmark, including dataset and model configurations."""
+
+    name: str
+    """Unique name of the benchmark."""
+
+    dataset_id: str
+    """Dataset id on ModelScope or path to a local dataset."""
+
+    data_adapter: Optional[Type['DataAdapter']] = None
+    """Data adapter class for the benchmark."""
+
+    output_types: List[str] = field(default_factory=lambda: [OutputType.GENERATION])
+    """List of output types supported by the benchmark."""
+
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
+    """List of subsets available for the benchmark."""
+
+    default_subset: str = 'default'
+    """Default subset to use for the benchmark."""
+
+    few_shot_num: int = 0
+    """Number of few-shot examples to use."""
+
+    few_shot_random: bool = False
+    """Whether to use random few-shot examples."""
+
+    train_split: Optional[str] = None
+    """Training split to use for the benchmark."""
+
+    eval_split: Optional[str] = None
+    """Evaluation split to use for the benchmark."""
+
+    prompt_template: Optional[str] = None
+    """Prompt template to use for the benchmark."""
+
+    few_shot_prompt_template: Optional[str] = None
+    """Few-shot prompt template to use for the benchmark."""
+
+    system_prompt: Optional[str] = None
+    """System prompt to use for the benchmark."""
+
+    query_template: Optional[str] = None
+    """Query template to use for the benchmark."""
+
+    pretty_name: Optional[str] = None
+    """Human-readable name for the benchmark."""
+
+    description: Optional[str] = None
+    """Description of the benchmark."""
+
+    tags: List[str] = field(default_factory=list)
+    """Tags associated with the benchmark."""
+
+    filters: Optional[OrderedDict] = None
+    """Filters to apply to the model output."""
+
+    metric_list: List[Union[str, Dict[str, Any]]] = field(default_factory=list)
+    """List of metrics to evaluate the benchmark."""
+
+    aggregation: str = 'mean'
+    """Aggregation function for the metrics. Default is 'mean'. Can be 'mean', 'pass@<k>' or a custom function name."""
+
+    shuffle: bool = False
+    """Whether to shuffle the dataset before evaluation."""
+
+    shuffle_choices: bool = False
+    """Whether to shuffle the choices in multiple-choice datasets."""
+
+    extra_params: Dict = field(default_factory=dict)
+    """Additional parameters for the benchmark."""
+
+    def __post_init__(self):
+        """Validate fields after initialization."""
+        if self.few_shot_num < 0:
+            raise ValueError('few_shot_num must be >= 0')
+
+    def _update(self, args: dict):
+        """Update the instance with provided arguments, maintaining backward compatibility."""
+        args = copy.deepcopy(args)
+
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+
+        if args.get('filters'):
+            if self.filters is None:
+                self.filters = OrderedDict()
+            new_filters = OrderedDict(args['filters'])
+            # Insert the new filters at the beginning
+            self.filters = OrderedDict(list(new_filters.items()) + list(self.filters.items()))
+            del args['filters']
+        # Update fields with validation
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+                if key == 'few_shot_num' and value < 0:  # re-validate few_shot_num on update
+                    raise ValueError('few_shot_num must be >= 0')
+
+    def to_dict(self) -> dict:
+        """Convert to a dictionary, maintaining backward compatibility."""
+        return asdict(self)
+
+    def to_string_dict(self) -> dict:
+        """Convert to a string dictionary, excluding data_adapter."""
+        cur_dict = copy.deepcopy(asdict(self))
+        if 'data_adapter' in cur_dict:
+            del cur_dict['data_adapter']
+        return cur_dict
evalscope/api/dataset/__init__.py
@@ -0,0 +1,2 @@
+from .dataset import Dataset, DatasetDict, MemoryDataset, Sample
+from .loader import DataLoader, DictDataLoader, LocalDataLoader, RemoteDataLoader
evalscope/api/dataset/dataset.py
@@ -0,0 +1,349 @@
+import abc
+import random
+from collections import defaultdict
+from dataclasses import dataclass, field
+from pydantic import BaseModel, Field
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
+
+from evalscope.api.messages import ChatMessage, messages_to_markdown
+from evalscope.api.tool import ToolInfo
+
+
+class Sample(BaseModel):
+    r"""Sample for an evaluation task."""
+
+    input: Union[str, List[ChatMessage]]
+    """The input to be submitted to the model."""
+
+    choices: Optional[List[str]] = None
+    """List of available answer choices (used only for multiple-choice evals)."""
+
+    target: Union[str, List[str]] = ''
+    """Ideal target output. May be a literal value or narrative text to be used by a model grader."""
+
+    id: Optional[int] = None
+    """Unique identifier for the sample."""
+
+    group_id: Optional[int] = None
+    """Identifier for the group this sample belongs to, used for grouping k repeated samples."""
+
+    tools: Optional[List[ToolInfo]] = None
+    """List of tools available to the model during inference (optional)."""
+
+    subset_key: Optional[str] = None
+    """Key for the subset this sample belongs to, used for generating subsets (optional)."""
+
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    """Arbitrary metadata associated with the sample."""
+
+    sandbox: Optional[str] = None
+    """Sandbox environment type and optional config file."""
+
+    files: Optional[Dict[str, str]] = None
+    """Files that go along with the sample (copied to the SandboxEnvironment)."""
+
+    setup: Optional[str] = None
+    """Setup script to run for the sample (run within the default SandboxEnvironment)."""
+
+    def pretty_print(self) -> str:
+        """Return a pretty-printed string representation of the sample."""
+        if isinstance(self.input, str):
+            input_text = self.input
+        else:
+            input_text = messages_to_markdown(self.input, max_length=50)
+        return f'Sample ID: {self.id}\nInput: {input_text}\nTarget: {self.target}'
+
+
+@dataclass
+class FieldSpec:
+    r"""Specification for mapping data source fields to sample fields."""
+
+    input: str = field(default='input')
+    """Name of the field containing the sample input."""
+
+    target: str = field(default='target')
+    """Name of the field containing the sample target."""
+
+    choices: str = field(default='choices')
+    """Name of the field containing the list of answer choices."""
+
+    id: int = field(default=0)
+    """Unique identifier for the sample."""
+
+    metadata: Optional[List[str]] = field(default=None)
+    """List of additional field names that should be read as metadata."""
+
+    sandbox: str = field(default='sandbox')
+    """Sandbox type along with optional config file."""
+
+    files: str = field(default='files')
+    """Files that go along with the sample."""
+
+    setup: str = field(default='setup')
+    """Setup script to run for the sample (run within the default SandboxEnvironment)."""
+
+
+class Dataset(Sequence[Sample], abc.ABC):
+    r"""A sequence of Sample objects.
+
+    Datasets provide sequential access (via conventional indexes or slicing)
+    to a collection of Sample objects.
+    """
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> Optional[str]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def location(self) -> Optional[str]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def shuffled(self) -> bool:
+        ...
+
+    @abc.abstractmethod
+    def __iter__(self) -> Iterator[Sample]:
+        """Return an iterator over the samples."""
+        ...
+
+    @abc.abstractmethod
+    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, 'Dataset']:
+        ...
+
+    @abc.abstractmethod
+    def __len__(self) -> int:
+        ...
+
+    @abc.abstractmethod
+    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'Dataset':
+        """Filter the dataset using a predicate. Only samples matching the predicate will be included.
+
+        Args:
+            predicate: Filtering function.
+            name: Name for the filtered dataset (optional).
+
+        Returns:
+            Filtered dataset.
+        """
+        ...
+
+    @abc.abstractmethod
+    def shuffle(self, seed: Optional[int] = None) -> None:
+        """Shuffle the order of the dataset (in place).
+
+        Args:
+            seed: Random seed for shuffling (optional).
+        """
+        ...
+
+    @abc.abstractmethod
+    def shuffle_choices(self, seed: Optional[int] = None) -> None:
+        """Shuffle the order of the choices within each sample.
+
+        Args:
+            seed: Random seed for shuffling (optional).
+        """
+        ...
+
+    @abc.abstractmethod
+    def reindex(self, group_size=1):
+        """Reindex the dataset samples to ensure consistent ordering.
+
+        Args:
+            group_size: Number of samples per group for setting group_id.
+        """
+        ...
+
+
+class MemoryDataset(Dataset):
+    r"""A Dataset stored in memory."""
+
+    def __init__(
+        self,
+        samples: List[Sample],
+        name: Optional[str] = None,
+        location: Optional[str] = None,
+        shuffled: bool = False,
+    ) -> None:
+        r"""A dataset of samples held in an in-memory list.
+
+        Datasets provide sequential access (via conventional indexes or slicing)
+        to a collection of Sample objects. The MemoryDataset is explicitly
+        initialized with a list that is held in memory.
+
+        Args:
+            samples (List[Sample]): The list of sample objects.
+            name (str | None): Optional name for the dataset.
+            location (str | None): Optional location for the dataset.
+            shuffled (bool): Whether the dataset was shuffled after reading.
+        """
+        self.samples = samples
+        self._name = name
+        self._location = location
+        self._shuffled = shuffled
+
+    @property
+    def name(self) -> Optional[str]:
+        """Dataset name."""
+        return self._name
+
+    @property
+    def location(self) -> Optional[str]:
+        """Dataset location."""
+        return self._location
+
+    @property
+    def shuffled(self) -> bool:
+        """Whether the dataset was shuffled."""
+        return self._shuffled
+
+    def __iter__(self) -> Iterator[Sample]:
+        return iter(self.samples)
+
+    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, Dataset]:
+        if isinstance(index, int):
+            return self.samples[index]
+        else:
+            return MemoryDataset(
+                samples=self.samples[index],
+                name=self.name,
+                location=self.location,
+                shuffled=self.shuffled,
+            )
+
+    def __len__(self) -> int:
+        return len(self.samples)
+
+    def shuffle(self, seed: Optional[int] = None) -> None:
+        if seed is not None:
+            random.Random(seed).shuffle(self.samples)
+        else:
+            random.shuffle(self.samples)
+        self._shuffled = True
+
+    def shuffle_choices(self, seed: Optional[int] = None) -> None:
+        from evalscope.utils.multi_choices import answer_character
+
+        rand = random.Random(seed)
+        for sample in self.samples:
+            if not sample.choices:
+                continue
+            # The original positions
+            positions = list(range(len(sample.choices)))
+
+            # Shuffle the choices
+            rand.shuffle(positions)
+            shuffled_choices = [sample.choices[i] for i in positions]
+
+            # Map from original position to new target letter
+            position_map = {i: answer_character(new_i) for new_i, i in enumerate(positions)}
+
+            # Update to the shuffled choices and target
+            sample.choices = shuffled_choices
+            sample.target = self._remap_target(sample.target, position_map=position_map)
+
+    def _remap_target(self, target: Union[str, List[str]], position_map: Dict[int, str]) -> Union[str, List[str]]:
+        from evalscope.utils.multi_choices import answer_index
+
+        if isinstance(target, list):
+            return [position_map[answer_index(t)] for t in target]
+        else:
+            return position_map[answer_index(target)]
+
+    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'MemoryDataset':
+        return MemoryDataset(
+            name=name or self.name,
+            location=self.location,
+            samples=[sample for sample in self.samples if predicate(sample)],
+            shuffled=self.shuffled,
+        )
+
+    def reindex(self, group_size=1):
+        # Reindex the dataset samples to ensure consistent ordering
+        for i, sample in enumerate(self.samples):
+            sample.id = i
+            sample.group_id = i // group_size
+
+
+class DatasetDict:
+    """
+    A dictionary-like container for datasets.
+    """
+
+    def __init__(self, datasets: Dict[str, Dataset]):
+        self.datasets = datasets
+
+    def __getitem__(self, key: str) -> Dataset:
+        return self.datasets[key]
+
+    def __setitem__(self, key: str, value: Dataset) -> None:
+        self.datasets[key] = value
+
+    def __delitem__(self, key: str) -> None:
+        del self.datasets[key]
+
+    def get(self, key: str, default: Optional[Dataset] = None) -> Optional[Dataset]:
+        return self.datasets.get(key, default)
+
+    def items(self):
+        return self.datasets.items()
+
+    def keys(self):
+        return self.datasets.keys()
+
+    def values(self):
+        return self.datasets.values()
+
+    def __len__(self) -> int:
+        return len(self.datasets)
+
+    @classmethod
+    def from_dataset(
+        cls,
+        dataset: Dataset,
+        subset_list: List[str],
+        limit: Optional[Union[int, float]] = None,
+        repeats: int = 1
+    ) -> 'DatasetDict':
+        """
+        Create a DatasetDict from a single Dataset using the subset key in each sample.
+
+        Args:
+            dataset (Dataset): The dataset to wrap in a DatasetDict.
+            subset_list (List[str]): List of subset keys to include.
+            limit (int | float | None): Optional limit on the number of samples per subset.
+                If int, limits to that many samples. If float, limits to that fraction of samples.
+
+        Returns:
+            DatasetDict: A new DatasetDict containing the provided dataset.
+        """
+        data_dict = defaultdict(list)
+        dataset_dict = defaultdict(list)
+        # Initialize subset keys to prevent ordering issues
+        for key in subset_list:
+            data_dict[key] = []
+            dataset_dict[key] = []
+
+        # Loop through each sample in the dataset
+        for sample in dataset.samples:
+            subset_key = sample.subset_key or 'default'
+            data_dict[subset_key].append(sample)
+        # Create a MemoryDataset for each subset key
+        for key, samples in data_dict.items():
+            if key not in subset_list:
+                continue
+            # Apply the limit if specified
+            if limit is not None:
+                if isinstance(limit, float):
+                    limit = int(len(samples) * limit)
+                total_limit = limit * repeats
+                samples = samples[:total_limit]
+            cur_dataset = MemoryDataset(samples, name=dataset.name)
+            # Reindex the dataset to ensure consistent IDs and group IDs
+            cur_dataset.reindex(group_size=repeats)
+            dataset_dict[key] = cur_dataset
+        return cls(dataset_dict)
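A similar sketch for the dataset primitives, using the re-exports from evalscope.api.dataset shown in the __init__.py hunk above; the sample contents and subset keys are invented for illustration.

from evalscope.api.dataset import DatasetDict, MemoryDataset, Sample

# Toy samples; subset_key decides which subset each sample lands in.
samples = [
    Sample(input='2 + 2 = ?', target='4', subset_key='arithmetic'),
    Sample(input='Capital of France?', target='Paris', subset_key='geo'),
    Sample(input='3 * 3 = ?', target='9', subset_key='arithmetic'),
]

dataset = MemoryDataset(samples, name='toy')
dataset.shuffle(seed=42)  # deterministic in-place shuffle

# Split into per-subset datasets; from_dataset() reindexes each subset so
# sample ids start at 0 and group_id groups repeated samples.
subsets = DatasetDict.from_dataset(dataset, subset_list=['arithmetic', 'geo'])
for name, subset in subsets.items():
    print(name, len(subset))

Note that from_dataset() reads dataset.samples directly, so it expects a MemoryDataset-style dataset rather than the abstract Dataset interface.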