evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/model/model.py (new file)
@@ -0,0 +1,386 @@
+ import abc
+ from pydantic_core import to_jsonable_python
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Optional, Sequence, Union
+
+ from evalscope.api.messages import ChatMessage, ChatMessageAssistant, ChatMessageSystem, ChatMessageUser
+ from evalscope.api.registry import get_model_api
+ from evalscope.api.tool import ToolChoice, ToolFunction, ToolInfo
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+ from .generate_config import GenerateConfig
+ from .model_output import ModelOutput
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class ModelAPI(abc.ABC):
+     """Model API provider."""
+
+     def __init__(
+         self,
+         model_name: str,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         config: GenerateConfig = GenerateConfig(),
+         **kwargs
+     ) -> None:
+         """Create a model API provider.
+
+         Args:
+             model_name (str): Model name.
+             base_url (str | None): Alternate base URL for model.
+             api_key (str | None): API key for model.
+             config (GenerateConfig): Model configuration.
+         """
+         self.model_name = model_name
+         self.base_url = base_url
+         self.api_key = api_key
+         self.config = config
+
+     @abc.abstractmethod
+     def generate(
+         self,
+         input: List[ChatMessage],
+         tools: List[ToolInfo],
+         tool_choice: ToolChoice,
+         config: GenerateConfig,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input (list[ChatMessage]): Chat message input.
+             tools (list[ToolInfo]): Tools available for the
+                 model to call.
+             tool_choice (ToolChoice): Directives to the model
+                 as to which tools to prefer.
+             config (GenerateConfig): Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         ...
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Default batch implementation using individual generate calls.
+
+         ModelAPI implementations can override this for optimized batch processing.
+
+         Args:
+             inputs: List of preprocessed chat message inputs.
+             tools: List of tools for each input.
+             tool_choices: List of tool choices for each input.
+             configs: List of configs for each input.
+
+         Returns:
+             Generator yielding ModelOutput for each input.
+         """
+         from concurrent.futures import ThreadPoolExecutor
+
+         def single_generate(args):
+             input_msgs, input_tools, tool_choice, config = args
+             return self.generate(input_msgs, input_tools, tool_choice, config)
+
+         with ThreadPoolExecutor(max_workers=self.config.batch_size) as executor:
+             futures = []
+             for input_msgs, input_tools, tool_choice, config in zip(inputs, tools, tool_choices, configs):
+                 future = executor.submit(single_generate, (input_msgs, input_tools, tool_choice, config))
+                 futures.append(future)
+
+             for future in futures:
+                 yield future.result()
+
+     def supports_batch(self) -> bool:
+         """Whether this ModelAPI supports optimized batch processing."""
+         return False
+
+     def max_tokens(self) -> Optional[int]:
+         """Default max_tokens."""
+         return None
+
+     def max_tokens_for_config(self, config: GenerateConfig) -> Optional[int]:
+         """Default max_tokens for a given config.
+
+         Args:
+             config: Generation config.
+
+         Returns:
+             Default maximum tokens for specified configuration.
+         """
+         return None
+
+     def tools_required(self) -> bool:
+         """Any tool use in a message stream means that tools must be passed."""
+         return False
+
+     def tool_result_images(self) -> bool:
+         """Tool results can contain images"""
+         return False
+
+
+ class Model:
+     """Model interface.
+
+     Use `get_model()` to get an instance of a model.
+     """
+
+     api: ModelAPI
+     """Model API."""
+
+     config: GenerateConfig
+     """Generation config."""
+
+     def __init__(self, api: ModelAPI, config: GenerateConfig, model_args: Dict[str, Any] = {}) -> None:
+         """Create a model.
+
+         Args:
+             api: Model API provider.
+             config: Model configuration.
+             model_args: Optional model args
+         """
+         self.api = api
+         self.config = config
+         self.model_args = model_args
+
+     @property
+     def name(self) -> str:
+         """Model name or path to model."""
+         return self.api.model_name
+
+     @property
+     def role(self) -> Optional[str]:
+         """Model role."""
+         return getattr(self, '_role', None)
+
+     @role.setter
+     def role(self, role: str) -> None:
+         self._role = role
+
+     def __str__(self) -> str:
+         return f'Model(name={self.name}, role={self.role})'
+
+     def generate(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input: Chat message input (if a `str` is passed it is converted
+                 to a `ChatMessageUser`).
+             tools: Tools available for the model to call.
+             tool_choice: Directives to the model as to which tools to prefer.
+             config: Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+             input, tools, tool_choice, config
+         )
+
+         # Call the model's generate method
+         output = self.api.generate(
+             input=processed_input,
+             tools=processed_tools,
+             tool_choice=processed_tool_choice,
+             config=processed_config,
+         )
+
+         return output
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Generate output from the model for a batch of inputs.
+
+         Args:
+             inputs (List[List[ChatMessage]]): Batch of chat message inputs.
+             tools (List[List[ToolInfo]]): Batch of tools for each input.
+             tool_choices (List[ToolChoice]): Batch of tool choices for each input.
+             configs (List[GenerateConfig]): Batch of configs for each input.
+         """
+         preprocessed_data = []
+
+         for input_item, input_tools, input_tool_choice, input_config in zip(inputs, tools, tool_choices, configs):
+             processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+                 input=input_item, tools=input_tools, tool_choice=input_tool_choice, config=input_config
+             )
+             preprocessed_data.append((processed_input, processed_tools, processed_tool_choice, processed_config))
+
+         # check if ModelAPI supports batch processing
+         if self.api.supports_batch() and len(preprocessed_data) > 1:
+             # use the batch_generate method of the ModelAPI
+             inputs, tools, tool_choices, configs = zip(*preprocessed_data)
+             batch_results = self.api.batch_generate(
+                 inputs=list(inputs), tools=list(tools), tool_choices=list(tool_choices), configs=list(configs)
+             )
+             for result in batch_results:
+                 yield result
+         else:
+             # fall back to processing each input individually
+             for input_msgs, input_tools, tool_choice, config in preprocessed_data:
+                 result = self.api.generate(input_msgs, input_tools, tool_choice, config)
+                 yield result
+
+     def _preprocess_input(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> tuple[List[ChatMessage], List[ToolInfo], ToolChoice, GenerateConfig]:
+         """Preprocess input for generate."""
+
+         # merge passed config
+         if config is not None:
+             config = self.config.merge(config)
+         else:
+             config = self.config.model_copy(deep=True)
+
+         # provide max_tokens from the model api if required
+         if config.max_tokens is None:
+             config.max_tokens = self.api.max_tokens_for_config(config)
+             if config.max_tokens is None:
+                 config.max_tokens = self.api.max_tokens()
+
+         # normalize input to chat
+         if isinstance(input, str):
+             input = [ChatMessageUser(content=input)]
+
+         # handle tools and tool_choice
+         tool_choice = tool_choice if tool_choice is not None else 'auto'
+         tools_info = list(tools) if tools is not None else []
+
+         if isinstance(tool_choice, ToolFunction):
+             tools_info = [tool for tool in tools_info if tool.name == tool_choice.name]
+
+         if tool_choice == 'none' or len(tools_info) == 0:
+             if not self.api.tools_required():
+                 tools_info = []
+             tool_choice = 'none'
+
+         return input, tools_info, tool_choice, config
+
+
+ class ModelCache:
+     _models: Dict[str, 'Model'] = {}
+
+     @classmethod
+     def get(cls, key: str) -> Optional['Model']:
+         return cls._models.get(key, None)
+
+     @classmethod
+     def set(cls, key: str, model: 'Model') -> None:
+         cls._models[key] = model
+
+
+ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
+     """Get an instance of a model with the specified task configuration.
+
+     Args:
+         task_config (TaskConfig): Task configuration.
+
+     Returns:
+         Model: An instance of the model.
+     """
+     model = task_config.model
+     eval_type = task_config.eval_type
+     base_url = task_config.api_url
+     api_key = task_config.api_key
+     config = task_config.generation_config
+     model_args = task_config.model_args or {}
+
+     return get_model(
+         model=model, eval_type=eval_type, base_url=base_url, api_key=api_key, config=config, model_args=model_args
+     )
+
+
+ @thread_safe
+ def get_model(
+     model: Union[str, Model, ModelAPI],
+     eval_type: str,
+     base_url: Optional[str] = None,
+     api_key: Optional[str] = None,
+     config: GenerateConfig = GenerateConfig(),
+     model_args: dict = {},
+     role: Optional[str] = None,
+     memoize: bool = True,
+ ) -> Model:
+     """Get an instance of a model.
+
+     Calls to get_model() are memoized (i.e. a call with the same arguments
+     will return an existing instance of the model rather than creating a
+     new one). You can disable this with `memoize=False`.
+
+     Args:
+         model (str | Model | ModelAPI): Model name, or an existing
+             Model / ModelAPI instance to return/wrap directly.
+         eval_type (str): Registered model API type used to resolve the provider.
+         base_url (str | None): Alternate base URL for the model.
+         api_key (str | None): API key for the model.
+         config (GenerateConfig): Model configuration.
+         model_args (dict): Additional args passed to the model API constructor.
+         role (str | None): Optional role to assign to the model.
+         memoize (bool): Whether to memoize the model instance.
+
+     Returns:
+         Model instance.
+
+     """
+
+     # start with seeing if a model was passed
+     if isinstance(model, Model):
+         return model
+
+     if isinstance(model, ModelAPI):
+         return Model(model, config, model_args)
+
+     # see if we can return a memoized model instance
+     # (exclude mockllm since custom_outputs is an infinite generator)
+     model_cache_key: str = ''
+     if eval_type.startswith('mock_llm'):
+         memoize = False
+     if memoize:
+         model_cache_key = (
+             model + str(role) + config.model_dump_json(exclude_none=True) + str(base_url) + str(api_key)
+             + str(to_jsonable_python(model_args, fallback=lambda _: None))
+         )
+         cached = ModelCache.get(model_cache_key)
+         if cached is not None:
+             return cached
+
+     logger.info(
+         f'Creating model {model} with eval_type={eval_type} '
+         f'base_url={base_url}, api_key={api_key}, config={config}, model_args={model_args}'
+     )
+
+     # find a matching model type
+     modelapi_type = get_model_api(eval_type)
+
+     modelapi_instance = modelapi_type(
+         model_name=model,
+         base_url=base_url,
+         api_key=api_key,
+         config=config,
+         **model_args,
+     )
+     m = Model(modelapi_instance, config, model_args)
+     if role is not None:
+         m.role = role
+     if memoize:
+         ModelCache.set(model_cache_key, m)
+     return m
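
The hunk above is the new model abstraction layer: `ModelAPI` is the provider interface, `Model` wraps a provider with config merging and input normalization, and `get_model()` resolves and memoizes instances. A minimal sketch of how the pieces fit together, using only names visible in this diff (the `EchoModelAPI` class and the `'echo'`/`'unused'` strings are illustrative, not part of the release):

```python
from typing import List

from evalscope.api.messages import ChatMessage
from evalscope.api.model.generate_config import GenerateConfig
from evalscope.api.model.model import ModelAPI, get_model
from evalscope.api.model.model_output import ModelOutput
from evalscope.api.tool import ToolChoice, ToolInfo


class EchoModelAPI(ModelAPI):
    """Illustrative provider: echoes the last chat message back."""

    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        # Return the content of the final message as the completion text.
        return ModelOutput.from_content(model=self.model_name, content=str(input[-1].content))


# A ModelAPI instance is wrapped directly; eval_type is only consulted for string names.
model = get_model(model=EchoModelAPI(model_name='echo'), eval_type='unused')
output = model.generate('hello')  # str input is normalized to a ChatMessageUser
print(output.completion)          # -> 'hello'
```

Since `get_model()` returns as soon as it sees a `ModelAPI` instance, no registry lookup happens here; string model names instead resolve through `get_model_api(eval_type)`. Note also that the default `batch_generate()` fans out over a `ThreadPoolExecutor` sized by `config.batch_size`, so a provider only needs to override it (and `supports_batch()`) when its backend has a true batch endpoint.
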
evalscope/api/model/model_output.py (new file)
@@ -0,0 +1,285 @@
+ import uuid
+ from pydantic import BaseModel, Field, JsonValue, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+ from evalscope.api.messages import ChatMessageAssistant, Content
+ from evalscope.api.tool import ToolCall, ToolFunction
+
+
+ class ModelUsage(BaseModel):
+     """Token usage for completion."""
+
+     input_tokens: int = Field(default=0)
+     """Total input tokens used."""
+
+     output_tokens: int = Field(default=0)
+     """Total output tokens used."""
+
+     total_tokens: int = Field(default=0)
+     """Total tokens used."""
+
+     input_tokens_cache_write: Optional[int] = Field(default=None)
+     """Number of tokens written to the cache."""
+
+     input_tokens_cache_read: Optional[int] = Field(default=None)
+     """Number of tokens retrieved from the cache."""
+
+     reasoning_tokens: Optional[int] = Field(default=None)
+     """Number of tokens used for reasoning."""
+
+     def __add__(self, other: 'ModelUsage') -> 'ModelUsage':
+
+         def optional_sum(a: Optional[int], b: Optional[int]) -> Optional[int]:
+             if a is not None and b is not None:
+                 return a + b
+             if a is not None:
+                 return a
+             if b is not None:
+                 return b
+             return None
+
+         return ModelUsage(
+             input_tokens=self.input_tokens + other.input_tokens,
+             output_tokens=self.output_tokens + other.output_tokens,
+             total_tokens=self.total_tokens + other.total_tokens,
+             input_tokens_cache_write=optional_sum(self.input_tokens_cache_write, other.input_tokens_cache_write),
+             input_tokens_cache_read=optional_sum(self.input_tokens_cache_read, other.input_tokens_cache_read),
+             reasoning_tokens=optional_sum(self.reasoning_tokens, other.reasoning_tokens),
+         )
+
+
+ StopReason = Literal[
+     'stop',
+     'max_tokens',
+     'model_length',
+     'tool_calls',
+     'content_filter',
+     'unknown',
+ ]
+ """Reason that the model stopped or failed to generate."""
+
+
+ class TopLogprob(BaseModel):
+     """List of the most likely tokens and their log probability, at this token position."""
+
+     token: str
+     """The top-kth token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the top-kth token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The top-kth token represented as a byte array (a list of integers)."""
+
+
+ class Logprob(BaseModel):
+     """Log probability for a token."""
+
+     token: str
+     """The predicted token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the predicted token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The predicted token represented as a byte array (a list of integers)."""
+
+     top_logprobs: Optional[List[TopLogprob]] = Field(default=None)
+     """If the `top_logprobs` argument is greater than 0, this will contain an ordered list of the top K most likely tokens and their log probabilities."""  # noqa: E501
+
+
+ class Logprobs(BaseModel):
+     """Log probability information for a completion choice."""
+
+     content: List[Logprob]
+     """a (num_generated_tokens,) length list containing the individual log probabilities for each generated token."""
+
+
+ class ChatCompletionChoice(BaseModel):
+     """Choice generated for completion."""
+
+     message: ChatMessageAssistant
+     """Assistant message."""
+
+     stop_reason: StopReason = Field(default='unknown')
+     """Reason that the model stopped generating."""
+
+     logprobs: Optional[Logprobs] = Field(default=None)
+     """Logprobs."""
+
+     @model_validator(mode='before')
+     @classmethod
+     def migrate_stop_reason(cls: Type['ChatCompletionChoice'], values: Dict[str, Any]) -> Dict[str, Any]:
+         if 'stop_reason' in values:
+             stop_reason = values['stop_reason']
+             if stop_reason == 'length':
+                 values['stop_reason'] = 'max_tokens'
+
+         return values
+
+     @classmethod
+     def from_content(cls, content: Union[str, List[Content]]) -> 'ChatCompletionChoice':
+         """Create a ChatCompletionChoice from content string."""
+         return cls(
+             message=ChatMessageAssistant(content=content),
+             stop_reason='stop',
+         )
+
+
+ class ModelOutput(BaseModel):
+     """Output from model generation."""
+
+     model: str = Field(default_factory=str)
+     """Model used for generation."""
+
+     choices: List[ChatCompletionChoice] = Field(default=[])
+     """Completion choices."""
+
+     usage: Optional[ModelUsage] = Field(default=None)
+     """Model token usage"""
+
+     time: Optional[float] = Field(default=None)
+     """Time elapsed (in seconds) for call to generate."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional metadata associated with model output."""
+
+     error: Optional[str] = Field(default=None)
+     """Error message in the case of content moderation refusals."""
+
+     @property
+     def empty(self) -> bool:
+         return len(self.choices) == 0
+
+     @property
+     def stop_reason(self) -> StopReason:
+         """First message stop reason."""
+         return self.choices[0].stop_reason
+
+     @property
+     def message(self) -> ChatMessageAssistant:
+         """First message choice."""
+         return self.choices[0].message
+
+     @property
+     def completion(self) -> str:
+         """Text of the first message choice."""
+         if len(self.choices) > 0:
+             return self.choices[0].message.text
+         else:
+             return ''
+
+     @completion.setter
+     def completion(self, completion: str) -> None:
+         """Set the text of the first message choice.
+
+         Args:
+             completion (str): Text for first message.
+         """
+         if len(self.choices) > 0:
+             self.choices[0].message.text = completion
+         else:
+             self.choices.append(
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=completion, model=self.model),
+                     stop_reason='stop',
+                 )
+             )
+
+     @property
+     def completions(self) -> List[str]:
+         """List of all message choices text."""
+         return [choice.message.text for choice in self.choices]
+
+     @staticmethod
+     def from_content(
+         model: str,
+         content: Union[str, List[Content]],
+         stop_reason: StopReason = 'stop',
+         error: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """Create ModelOutput from simple text content.
+
+         Args:
+             model: Model name.
+             content: Text content from generation.
+             stop_reason: Stop reason for generation.
+             error: Error message.
+         """
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=content, model=model, source='generate'),
+                     stop_reason=stop_reason,
+                 )
+             ],
+             error=error,
+         )
+
+     @staticmethod
+     def for_tool_call(
+         model: str,
+         tool_name: str,
+         tool_arguments: Dict[str, Any],
+         internal: Optional[JsonValue] = None,
+         tool_call_id: Optional[str] = None,
+         content: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """
+         Returns a ModelOutput for requesting a tool call.
+
+         Args:
+             model: Model name.
+             tool_name: The name of the tool.
+             tool_arguments: The arguments passed to the tool.
+             internal: The model's internal info for the tool (if any).
+             tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
+             content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
+
+         Returns:
+             A ModelOutput corresponding to the tool call
+         """
+         if content is None:
+             content = f'tool call for tool {tool_name}'
+
+         if tool_call_id is None:
+             tool_call_id = f'for_tool_call_{uuid.uuid4()}'
+
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(
+                         content=content,
+                         model=model,
+                         source='generate',
+                         tool_calls=[
+                             ToolCall(
+                                 id=tool_call_id,
+                                 internal=internal,
+                                 function=ToolFunction(
+                                     name=tool_name,
+                                     arguments=tool_arguments,
+                                 )
+                             )
+                         ],
+                     ),
+                     stop_reason='tool_calls',
+                 )
+             ],
+         )
+
+
+ def as_stop_reason(reason: Optional[str]) -> StopReason:
+     """Encode common reason strings into standard StopReason."""
+     if reason in ['stop', 'eos']:
+         return 'stop'
+     elif reason == 'length':
+         return 'max_tokens'
+     elif reason in ['tool_calls', 'function_call']:
+         return 'tool_calls'
+     elif reason in ['content_filter', 'model_length', 'max_tokens']:
+         return reason
+     else:
+         return 'unknown'
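
To round out the picture, a short sketch exercising the `ModelOutput` helpers defined above; the model name and tool arguments are made up for illustration:

```python
from evalscope.api.model.model_output import ModelOutput, ModelUsage, as_stop_reason

# Plain-text output; stop_reason defaults to 'stop'.
out = ModelOutput.from_content(model='demo-model', content='The answer is 42.')
assert out.completion == 'The answer is 42.'
assert out.stop_reason == 'stop'

# Output that requests a tool call; stop_reason becomes 'tool_calls'.
call = ModelOutput.for_tool_call(
    model='demo-model',
    tool_name='search',
    tool_arguments={'query': 'evalscope'},
)
assert call.stop_reason == 'tool_calls'
assert call.message.tool_calls[0].function.name == 'search'

# ModelUsage supports +; optional fields survive when only one side sets them.
total = (
    ModelUsage(input_tokens=10, output_tokens=5, total_tokens=15)
    + ModelUsage(input_tokens=3, output_tokens=2, total_tokens=5, reasoning_tokens=2)
)
assert (total.total_tokens, total.reasoning_tokens) == (20, 2)

# Provider-specific finish reasons normalize onto the StopReason literals.
assert as_stop_reason('length') == 'max_tokens'
assert as_stop_reason('eos') == 'stop'
```

These helpers are what the adapters in this release build on: `from_content` covers ordinary text completions, `for_tool_call` covers tool-use turns, and `as_stop_reason` maps provider finish reasons onto the shared `StopReason` vocabulary.
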