evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/mixin/llm_judge_mixin.py (new file)
@@ -0,0 +1,168 @@
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.constants import JudgeStrategy
+ from evalscope.metrics import LLMJudge
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class LLMJudgeMixin:
+     """
+     Mixin class for LLM Judge functionality.
+     """
+
+     def __init__(self, task_config: 'TaskConfig'):
+         self._task_config = task_config
+         self._use_llm_judge = False
+         """Whether to use LLM as a judge"""
+
+         self._llm_judge: Optional[LLMJudge] = None
+
+     @property
+     def llm_judge(self) -> Optional[LLMJudge]:
+         """Get LLM judge instance with lazy initialization."""
+         if self._llm_judge is None and self.use_llm_judge:
+             self._llm_judge = self.init_llm_judge()
+         return self._llm_judge
+
+     @llm_judge.setter
+     def llm_judge(self, value: Optional[LLMJudge]):
+         """Set LLM judge instance."""
+         self._llm_judge = value
+
+     @property
+     def judge_strategy(self) -> str:
+         """Get the judge strategy from the task configuration."""
+         return self._task_config.judge_strategy
+
+     @property
+     def use_llm_judge(self) -> bool:
+         """Check if LLM judge is enabled."""
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return False
+         elif self.judge_strategy == JudgeStrategy.LLM:
+             return True
+         elif self.judge_strategy == JudgeStrategy.LLM_RECALL:
+             return True
+         elif self.judge_strategy == JudgeStrategy.AUTO:
+             return self._use_llm_judge
+         else:
+             logger.warning(f'Unknown judge strategy: {self.judge_strategy}. Defaulting to False.')
+             return False
+
+     def init_llm_judge(self) -> Optional[LLMJudge]:
+         """
+         Initialize the LLM judge for the benchmark.
+
+         Returns:
+             Optional[LLMJudge]: The initialized LLM judge instance or None
+         """
+
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return None
+         else:
+             return LLMJudge(**self._task_config.judge_model_args)
+
+     def maybe_llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+         rule_based_score: Optional[Score] = None,
+     ) -> Score:
+         """
+         Compute the match score between the original and filtered predictions against the reference.
+
+         Args:
+             original_prediction: The original prediction output from the model.
+             filtered_prediction: The filtered prediction output from the model.
+             reference: The ground truth reference output.
+             task_state: The current task state.
+             rule_based_score: Optional rule-based score to be used for comparison.
+
+         Returns:
+             Score: The computed match score.
+         """
+         # If LLM judge is not used, return the rule-based score directly
+         if not self.use_llm_judge:
+             return rule_based_score
+
+         # For LLM_RECALL, if rule-based score is already perfect, skip LLM judge
+         if float(rule_based_score.main_value) > 0.99:
+             return rule_based_score
+
+         # Compute LLM judge score
+         llm_score = self.llm_match_score(
+             original_prediction=original_prediction,
+             filtered_prediction=filtered_prediction,
+             reference=reference,
+             task_state=task_state,
+         )
+
+         # For LLM RECALL, merge the scores
+         return self._merge_scores(rule_based_score, llm_score)
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """Compute the LLM match score.
+
+         Args:
+             original_prediction (str): The original prediction output from the model.
+             filtered_prediction (str): The filtered prediction output from the model.
+             reference (str): The ground truth reference output.
+             task_state (TaskState): The current task state.
+
+         Returns:
+             Score: The computed match score.
+         """
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Request judge and obtain score
+         prompt = self.llm_judge.build_prompt(pred=original_prediction, gold=reference, question=question)
+         judge_response = self.llm_judge.judge(prompt)
+         judge_score = self.llm_judge.get_score(judge_response)
+
+         score.value = {'acc': judge_score}
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+
+         return score
+
+     def _merge_scores(self, rule_based_score: Score, llm_score: Score) -> Score:
+         """
+         Merge rule-based score with LLM judge score for LLM_RECALL strategy.
+
+         Args:
+             rule_based_score: The original rule-based score
+             llm_score: The LLM judge score
+
+         Returns:
+             Score: The merged score
+         """
+         # Update the main value with LLM judge result
+         rule_based_score.main_value = llm_score.main_value
+         rule_based_score.explanation = llm_score.explanation
+         rule_based_score.metadata = llm_score.metadata
+
+         return rule_based_score
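
The mixin above is designed to be combined with a benchmark adapter: the adapter computes a rule-based Score first and hands it to maybe_llm_match_score, which either keeps it or replaces it with an LLM judgement depending on the configured JudgeStrategy. The following is a minimal usage sketch; the adapter class and its exact-match rule are illustrative only, and only the mixin API shown in the hunk above is taken from the diff.

    # Hypothetical adapter combining rule-based scoring with the LLM judge fallback.
    from evalscope.api.metric import Score


    class MyAdapter(LLMJudgeMixin):

        def match_score(self, original: str, filtered: str, reference: str, task_state) -> Score:
            # Rule-based scoring first (simple exact match, purely for illustration).
            rule_score = Score(
                extracted_prediction=filtered,
                prediction=original,
            )
            rule_score.value = {'acc': float(filtered.strip() == reference.strip())}

            # The mixin decides, based on JudgeStrategy, whether to keep this score
            # or override it with an LLM judgement.
            return self.maybe_llm_match_score(
                original_prediction=original,
                filtered_prediction=filtered,
                reference=reference,
                task_state=task_state,
                rule_based_score=rule_score,
            )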
evalscope/api/model/__init__.py (new file)
@@ -0,0 +1,12 @@
+ from .generate_config import GenerateConfig
+ from .model import Model, ModelAPI, get_model, get_model_with_task_config
+ from .model_output import (
+     ChatCompletionChoice,
+     Logprob,
+     Logprobs,
+     ModelOutput,
+     ModelUsage,
+     StopReason,
+     TopLogprob,
+     as_stop_reason,
+ )
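
These re-exports make evalscope.api.model the single public import surface for the new model layer, so downstream code can pull the config, model, and output types from one place, for example:

    from evalscope.api.model import GenerateConfig, Model, ModelOutput, get_model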
evalscope/api/model/generate_config.py (new file)
@@ -0,0 +1,155 @@
+ # flake8: noqa: E501
+ from copy import deepcopy
+ from pydantic import BaseModel, Field, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from evalscope.utils.json_schema import JSONSchema
+
+
+ class ResponseSchema(BaseModel):
+     """Schema for model response when using Structured Output."""
+
+     name: str
+     """The name of the response schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64."""
+
+     json_schema: JSONSchema
+     """The schema for the response format, described as a JSON Schema object."""
+
+     description: Optional[str] = Field(default=None)
+     """A description of what the response format is for, used by the model to determine how to respond in the format."""
+
+     strict: Optional[bool] = Field(default=None)
+     """Whether to enable strict schema adherence when generating the output. If set to true, the model will always follow the exact schema defined in the schema field.
+     OpenAI and Mistral only."""
+
+
+ class GenerateConfig(BaseModel):
+     """Model generation options."""
+     model_config = {'extra': 'allow'}
+
+     timeout: Optional[int] = Field(default=None)
+     """Request timeout (in seconds)."""
+
+     batch_size: Optional[int] = Field(default=None)
+     """Maximum number of concurrent connections to Model API (default is model specific) or batch size for generation."""
+
+     stream: Optional[bool] = Field(default=None)
+     """Whether to stream the response (default is model specific)."""
+
+     system_message: Optional[str] = Field(default=None)
+     """Override the default system message."""
+
+     max_tokens: Optional[int] = Field(default=None)
+     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
+
+     top_p: Optional[float] = Field(default=None)
+     """An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass."""
+
+     temperature: Optional[float] = Field(default=None)
+     """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."""
+
+     stop_seqs: Optional[List[str]] = Field(default=None)
+     """Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""
+
+     best_of: Optional[int] = Field(default=None)
+     """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
+
+     frequency_penalty: Optional[float] = Field(default=None)
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+     presence_penalty: Optional[float] = Field(default=None)
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+     logit_bias: Optional[Dict[int, float]] = Field(default=None)
+     """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and vLLM only."""
+
+     seed: Optional[int] = Field(default=None)
+     """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
+
+     do_sample: Optional[bool] = Field(default=None)
+     """Whether to use sampling; use greedy decoding otherwise. Only transformers models support this parameter."""
+
+     top_k: Optional[int] = Field(default=None)
+     """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""
+
+     n: Optional[int] = Field(default=None)
+     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""
+
+     logprobs: Optional[bool] = Field(default=None)
+     """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""
+
+     top_logprobs: Optional[int] = Field(default=None)
+     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""
+
+     parallel_tool_calls: Optional[bool] = Field(default=None)
+     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
+
+     internal_tools: Optional[bool] = Field(default=None)
+     """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""
+
+     max_tool_output: Optional[int] = Field(default=None)
+     """Maximum tool output (in bytes). Defaults to 16 * 1024."""
+
+     cache_prompt: Union[Literal['auto'], bool, None] = Field(default=None)
+     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
+
+     reasoning_effort: Optional[Literal['low', 'medium', 'high']] = Field(default=None)
+     """Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o1 models only."""
+
+     reasoning_tokens: Optional[int] = Field(default=None)
+     """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+     reasoning_summary: Optional[Literal['concise', 'detailed', 'auto']] = Field(default=None)
+     """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""
+
+     reasoning_history: Optional[Literal['none', 'all', 'last', 'auto']] = Field(default=None)
+     """Include reasoning in chat message history sent to generate."""
+
+     response_schema: Optional[ResponseSchema] = Field(default=None)
+     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
+
+     extra_body: Optional[Dict[str, Any]] = Field(default=None)
+     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+     height: Optional[int] = Field(default=None)
+     """Image height for image generation model only"""
+
+     width: Optional[int] = Field(default=None)
+     """Image width for image generation model only"""
+
+     num_inference_steps: Optional[int] = Field(default=None)
+     """Number of inference steps for image generation model only"""
+
+     guidance_scale: Optional[float] = Field(default=None)
+     """Guidance scale for image generation model only"""
+
+     # migrate reasoning_history as a bool
+     @model_validator(mode='before')
+     @classmethod
+     def migrate_reasoning(cls, data: Any) -> Any:
+         if isinstance(data, dict):
+             reasoning_history = data.get('reasoning_history', None)
+             if reasoning_history is True:
+                 data['reasoning_history'] = 'all'
+             elif reasoning_history is False:
+                 data['reasoning_history'] = 'none'
+
+         return data
+
+     def merge(self, other: 'GenerateConfig') -> 'GenerateConfig':
+         """Merge another model configuration into this one.
+
+         Args:
+             other (GenerateConfig):
+                 Configuration to merge.
+
+         Returns:
+             Merged configuration.
+         """
+         config_keys = [field for field in self.__class__.model_fields.keys()]
+         config = deepcopy(self)
+         for key in config_keys:
+             value = getattr(other, key, None)
+             if value is not None:
+                 setattr(config, key, value)
+         return config
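
Because every field defaults to None, merge is effectively an overlay: any field explicitly set on other wins, while fields only set on self are preserved. The migrate_reasoning validator additionally maps legacy boolean reasoning_history values onto the new literal form. A short illustration of both behaviors, using the public import path from the __init__.py hunk above:

    from evalscope.api.model import GenerateConfig

    base = GenerateConfig(temperature=0.0, max_tokens=512)
    override = GenerateConfig(temperature=0.7, top_p=0.9)

    merged = base.merge(override)
    assert merged.temperature == 0.7   # taken from `override`
    assert merged.top_p == 0.9         # taken from `override`
    assert merged.max_tokens == 512    # kept from `base` (unset on `override`)

    # Legacy boolean values for reasoning_history are migrated by the validator.
    assert GenerateConfig(reasoning_history=True).reasoning_history == 'all'
    assert GenerateConfig(reasoning_history=False).reasoning_history == 'none'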