evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py
@@ -27,7 +27,8 @@ def default_query_distribution(llm: BaseRagasLLM, kg: KnowledgeGraph, language:
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  default_queries = [
  single_hop,
evalscope/backend/rag_eval/ragas/tasks/build_transform.py
@@ -44,8 +44,9 @@ def default_transforms(
  return bins

  def filter_doc_with_num_tokens(node, min_num_tokens=500):
- return (node.type == NodeType.DOCUMENT
- and num_tokens_from_string(node.properties['page_content']) > min_num_tokens)
+ return (
+ node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > min_num_tokens
+ )

  def filter_docs(node):
  return node.type == NodeType.DOCUMENT
@@ -90,7 +91,8 @@ def default_transforms(
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  transforms = [
  headline_extractor,
@@ -121,7 +123,8 @@ def default_transforms(
  target_lang=language,
  llm=llm,
  adapt_instruction=True,
- ))
+ )
+ )

  transforms = [
  summary_extractor,
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
@@ -113,7 +113,8 @@ def generate_testset(args: TestsetGenerationArguments) -> None:

  # generate testset
  generator = TestsetGenerator(
- llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list)
+ llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+ )

  testset = generator.generate(
  testset_size=args.test_size,
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
@@ -34,7 +34,8 @@ async def translate_prompt(

  logger.info(f'Translating prompts to {target_lang}')
  adapted_prompts = await prompt_user.adapt_prompts(
- language=target_lang, llm=llm, adapt_instruction=adapt_instruction)
+ language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+ )
  prompt_user.set_prompts(**adapted_prompts)
  try:
  prompt_user.save_prompts(prompt_dir)
evalscope/backend/rag_eval/utils/embedding.py
@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
  max_length=self.max_seq_length,
  automodel_args=self.model_kwargs,
  )
+ self.tokenizer = self.model.tokenizer
+ # set pad token
+ if self.tokenizer.pad_token is None:
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+ self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
  self.supported_encode_params = get_supported_params(self.model.predict)

  def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
  self.openai_api_base = kwargs.get('api_base')
  self.openai_api_key = kwargs.get('api_key')
  self.dimensions = kwargs.get('dimensions')
+ self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
  self.framework = ['API']

  self.model = OpenAIEmbeddings(
@@ -196,7 +204,8 @@ class APIEmbeddingModel(BaseModel):
  openai_api_base=self.openai_api_base,
  openai_api_key=self.openai_api_key,
  dimensions=self.dimensions,
- check_embedding_ctx_length=False)
+ check_embedding_ctx_length=self.check_embedding_ctx_length,
+ )

  super().__init__(model_name_or_path=self.model_name, **kwargs)

evalscope/backend/rag_eval/utils/llm.py
@@ -2,11 +2,10 @@ import os
  from langchain_core.callbacks.manager import CallbackManagerForLLMRun
  from langchain_core.language_models.llms import LLM as BaseLLM
  from langchain_openai import ChatOpenAI
- from transformers.generation.configuration_utils import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional

- from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models import ChatGenerationModelAdapter, LocalModel
+ from evalscope.api.model import GenerateConfig, Model, get_model
+ from evalscope.constants import DEFAULT_MODEL_REVISION, EvalType


  class LLM:
@@ -30,16 +29,19 @@ class LocalLLM(BaseLLM):
  model_name_or_path: str
  model_revision: str = DEFAULT_MODEL_REVISION
  template_type: Optional[str] = None
- model_name: Optional[str]
- model: Optional[ChatGenerationModelAdapter]
- generation_config: Optional[Dict]
+ model_name: Optional[str] = None
+ model: Optional[Model] = None
+ generation_config: Optional[Dict] = {}

  def __init__(self, **kw):
  super().__init__(**kw)
  self.model_name = os.path.basename(self.model_name_or_path)
- self.model = ChatGenerationModelAdapter(
- model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
- generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+
+ # Create and initialize the local model
+ self.model = get_model(
+ model=self.model_name_or_path,
+ eval_type=EvalType.CHECKPOINT,
+ config=GenerateConfig(**self.generation_config),
  )

  def _call(
@@ -50,10 +52,9 @@ class LocalLLM(BaseLLM):
  **kwargs: Any,
  ) -> str:
  """Run the LLM on the given input."""
- infer_cfg = {'stop': stop}

- response, _ = self.model.predict([{'data': [prompt]}], infer_cfg=infer_cfg)
- return response[0][0]
+ response = self.model.generate(input=prompt)
+ return response.completion

  @property
  def _identifying_params(self) -> Dict[str, Any]:
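Note on the hunk above: the rag_eval LocalLLM wrapper now routes through the unified model API under evalscope/api/model. The following is a minimal sketch of that call path based only on the calls shown in this hunk (the checkpoint id is a placeholder and GenerateConfig is left at its defaults), not a verbatim snippet from the package:

from evalscope.api.model import GenerateConfig, get_model
from evalscope.constants import EvalType

# Load a local checkpoint through the new unified model API (placeholder model id).
model = get_model(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # hypothetical checkpoint path/id
    eval_type=EvalType.CHECKPOINT,
    config=GenerateConfig(),  # LocalLLM forwards its generation_config dict here
)
response = model.generate(input='What is 2 + 2?')
print(response.completion)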
evalscope/benchmarks/__init__.py
@@ -4,8 +4,6 @@ import importlib
  import os
  import time

- from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
- from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.utils import get_logger

  logger = get_logger()
evalscope/benchmarks/aime/aime24_adapter.py
@@ -1,5 +1,12 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -7,46 +14,37 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()


- @Benchmark.register(
- name='aime24',
- pretty_name='AIME-2024',
- tags=['Mathematics'],
- description=
- 'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
- dataset_id='HuggingFaceH4/aime_2024',
- subset_list=['default'],
- metric_list=['AveragePass@1'],
- few_shot_num=0,
- train_split=None,
- eval_split='train', # Only train set is available
- prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ @register_benchmark(
+ BenchmarkMeta(
+ name='aime24',
+ pretty_name='AIME-2024',
+ tags=[Tags.MATH, Tags.REASONING],
+ description=
+ 'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
+ dataset_id='HuggingFaceH4/aime_2024',
+ subset_list=['default'],
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # Only train set is available
+ prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
  )
- class AIME24Adapter(DataAdapter):
+ class AIME24Adapter(DefaultDataAdapter):

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)

- def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
- """
- Generate the prompt for the model input.
- """
- problem = input_d['problem']
- full_prompt = self.prompt_template.format(query=problem)
-
- return self.gen_prompt_data(full_prompt)
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Extract the gold answer from the input dict.
- return strip_answer_string(input_d['answer'])
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- """
- Parse the model output to get the answer. Could be the best choice index.
- """
- # Note: Use same extraction method for both of checkpoint/service/custom
- result = strip_answer_string(extract_answer(result))
- return result
-
- def match(self, gold: str, pred: str) -> float:
- res = math_equal(pred, gold)
- return 1.0 if res else 0.0
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ return Sample(
+ input=record['problem'],
+ target=record['answer'],
+ metadata={
+ 'problem_id': record.get('id', ''),
+ 'solution': record.get('solution', ''),
+ },
+ )
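The AIME adapter above illustrates the new registration pattern: the old @Benchmark.register decorator with per-method prompt/parse/match hooks is replaced by @register_benchmark(BenchmarkMeta(...)) plus a DefaultDataAdapter subclass that only maps raw records to Sample objects. A minimal sketch of a custom adapter following the same pattern; the benchmark name, dataset id, and record field names below are placeholders, not part of the package:

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_math_bench',  # hypothetical benchmark name
        pretty_name='My-Math-Bench',
        tags=[Tags.MATH],
        description='Example registration of a custom math benchmark.',
        dataset_id='org/my_math_bench',  # placeholder dataset id
        subset_list=['default'],
        metric_list=[{'acc': {'numeric': True}}],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
    )
)
class MyMathBenchAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to the framework's Sample object.
        return Sample(input=record['question'], target=record['answer'])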
evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,5 +1,12 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -7,46 +14,33 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()


- @Benchmark.register(
- name='aime25',
- pretty_name='AIME-2025',
- tags=['Mathematics'],
- description=
- 'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
- dataset_id='opencompass/AIME2025',
- subset_list=['AIME2025-I', 'AIME2025-II'],
- metric_list=['AveragePass@1'],
- few_shot_num=0,
- train_split=None,
- eval_split='test', # Only train set is available
- prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ @register_benchmark(
+ BenchmarkMeta(
+ name='aime25',
+ pretty_name='AIME-2025',
+ tags=[Tags.MATH, Tags.REASONING],
+ description=
+ 'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
+ dataset_id='opencompass/AIME2025',
+ subset_list=['AIME2025-I', 'AIME2025-II'],
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='test',
+ prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
  )
- class AIME25Adapter(DataAdapter):
+ class AIME25Adapter(DefaultDataAdapter):

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)

- def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
- """
- Generate the prompt for the model input.
- """
- problem = input_d['question']
- full_prompt = self.prompt_template.format(query=problem)
-
- return self.gen_prompt_data(full_prompt)
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Extract the gold answer from the input dict.
- return strip_answer_string(input_d['answer'])
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- """
- Parse the model output to get the answer. Could be the best choice index.
- """
- # Note: Use same extraction method for both of checkpoint/service/custom
- result = strip_answer_string(extract_answer(result))
- return result
-
- def match(self, gold: str, pred: str) -> float:
- res = math_equal(pred, gold)
- return 1.0 if res else 0.0
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ return Sample(
+ input=record['question'],
+ target=record['answer'],
+ )
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -1,16 +1,17 @@
  import re
- from collections import defaultdict
- from typing import Any, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

- GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
+ GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.""" # noqa: E501

  GRADER_TEMPLATE = """
  I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
@@ -44,64 +45,89 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
  """.strip() # noqa: E501


- @Benchmark.register(
- name='alpaca_eval',
- pretty_name='AlpacaEval2.0',
- tags=['Instruction-Following', 'Arena'],
- description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
- 'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
- 'provide more accurate and cost-effective model assessments. '
- 'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
- dataset_id='AI-ModelScope/alpaca_eval',
- subset_list=['alpaca_eval_gpt4_baseline'],
- metric_list=['winrate'],
- few_shot_num=0,
- train_split=None,
- eval_split='eval')
- class AlpacaEvalAdapter(DataAdapter):
+ @register_benchmark(
+ BenchmarkMeta(
+ name='alpaca_eval',
+ pretty_name='AlpacaEval2.0',
+ tags=[Tags.INSTRUCTION_FOLLOWING, Tags.ARENA],
+ description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+ 'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+ 'provide more accurate and cost-effective model assessments. '
+ 'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
+ dataset_id='AI-ModelScope/alpaca_eval',
+ subset_list=['alpaca_eval_gpt4_baseline'],
+ metric_list=['winrate'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='eval',
+ prompt_template='{question}'
+ )
+ )
+ class AlpacaEvalAdapter(DefaultDataAdapter):

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)

- # register metrics
- metric_registry.register(Metric(name='winrate', object=mean))
-
- # whether to use LLM as a judge
- self.llm_as_a_judge = True
-
- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
- question = input_d['instruction']
- return self.gen_prompt_data(question)
-
- def get_gold_answer(self, input_d: dict) -> str:
- return input_d['output']
+ self._use_llm_judge = True # Use LLM as a judge by default
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """
+ Convert a data record to a Sample object.
+
+ Args:
+ record (Dict[str, Any]): Input data record.
+
+ Returns:
+ Sample: Sample object with input, target, and metadata.
+ """
+ instruction = record['instruction']
+ baseline_output = record['output'] # baseline model output
+
+ return Sample(
+ input=instruction,
+ target=baseline_output,
+ metadata={
+ 'generator': record.get('generator', 'unknown'),
+ 'dataset': record.get('dataset', 'unknown')
+ }
+ )
+
+ def llm_match_score(
+ self,
+ original_prediction: str,
+ filtered_prediction: str,
+ reference: str,
+ task_state: TaskState,
+ ) -> Score:
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ instruction = task_state.input_text
+
+ # Request judge and obtain score
+ # reference is baseline answer 'm', filtered_prediction is model answer 'M'
+ prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=reference, output_2=filtered_prediction)
+ judge_response = self.llm_judge.judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)

- def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
- return result.strip()
-
- def match(self, gold: str, pred: str):
- # simple match
- logger.warning(f'Please use LLMJudge to match the result for {self.name}')
- return None
-
- def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
- raw_input = kwargs.get('raw_input', None)
- instruction = raw_input['instruction']
- # gold is baseline answer 'm', pred is model answer 'M'
- prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
- # get grading response
- grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
  # parse grading response
- match = re.search(r'(m|M)', grading_response)
+ match = re.search(r'(m|M)', judge_response)
  res = match.group(0) if match else None
+
  if res:
- return res == 'M'
+ winrate = 1 if res == 'M' else 0
  else:
- logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
- return None
-
- def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
- # zip dict answers
- res_list = [res for res in review_res_list if res is not None]
-
- return super().compute_metric(res_list, **kwargs)
+ logger.info(f'Failed to parse grading response: {prompt=}\n {judge_response=}')
+ winrate = 0
+
+ # Set score based on the match result
+ score.value = {'winrate': winrate}
+ score.explanation = f'LLM judge: {judge_response}'
+ score.metadata = {
+ 'source': 'llm_judge',
+ 'judge_strategy': self.judge_strategy,
+ 'model': self.llm_judge.model_id
+ }
+ score.main_score_name = 'winrate'
+ return score
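For context, a benchmark registered this way is selected by its name at run time. A hedged sketch of a small smoke-test run, assuming the top-level TaskConfig/run_task entry points from the 0.x releases are still exported in 1.0.1 (evalscope/__init__.py changes by only +4 -1 in this diff); the model id is a placeholder:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model checkpoint/id
    datasets=['aime24'],                 # one of the benchmark names registered earlier in this diff
    limit=5,                             # evaluate only a few samples as a smoke test
)
run_task(task_cfg=task_cfg)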