evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as potentially problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/models/adapters/server_adapter.py (deleted)
@@ -1,236 +0,0 @@
- import copy
- import openai
- from collections import defaultdict
- from openai.types.chat import ChatCompletion, ChatCompletionChunk
- from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
- from typing import List, Optional, Union
-
- from evalscope.utils.argument_utils import get_supported_params
- from evalscope.utils.logger import get_logger
- from ..register import register_model_adapter
- from .base_adapter import BaseModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name='server')
- class ServerModelAdapter(BaseModelAdapter):
-     """
-     Server model adapter to request remote API model and generate results.
-     """
-
-     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-         """
-         Args:
-             api_url: The URL of the remote API model.
-             model_id: The ID of the remote API model.
-             api_key: The API key of the remote API model.
-         """
-         self.api_url = api_url.rstrip('/').rsplit('/chat/completions', 1)[0]
-         self.model_id = model_id
-         self.api_key = api_key
-
-         self.client = openai.OpenAI(
-             api_key=self.api_key,
-             base_url=self.api_url,
-         )
-         self.supported_params = get_supported_params(self.client.chat.completions.create)
-
-         self.seed = kwargs.get('seed', None)
-         self.timeout = kwargs.get('timeout', 60)
-         self.stream = kwargs.get('stream', False)
-         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
-         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
-
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Model prediction func.
-
-         Args:
-             inputs (List[dict]): The input data.
-             infer_cfg (dict): Inference configuration.
-
-         Returns:
-             res (List[dict]): The model prediction results.
-         """
-         infer_cfg = infer_cfg or {}
-         results = []
-
-         for input_item in inputs:
-             response = self.process_single_input(input_item, infer_cfg)
-             results.append(response)
-
-         return results
-
-     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
-         """Process a single input item."""
-         request_json = self.make_request(input_item, infer_cfg)
-         response = self.send_request(request_json)
-         return response
-
-     def make_request_messages(self, input_item: dict) -> list:
-         """
-         Make request messages for OpenAI API.
-         """
-         if input_item.get('messages', None):
-             return input_item['messages']
-
-         data: list = input_item['data']
-         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-             query = '\n'.join(''.join(item) for item in data)
-             system_prompt = input_item.get('system_prompt', None)
-         else:
-             query = data[0]
-             system_prompt = input_item.get('system_prompt', None)
-
-         messages = []
-         if system_prompt:
-             messages.append({'role': 'system', 'content': system_prompt})
-
-         messages.append({'role': 'user', 'content': query})
-
-         return messages
-
-     def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
-         """Make request to remote API."""
-         messages = self.make_request_messages(input_item)
-         # Format request JSON according to OpenAI API format
-         request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}
-
-         if self.timeout:
-             request_json['timeout'] = self.timeout
-
-         request_json['stream'] = self.stream
-         if self.stream:
-             request_json['stream_options'] = {'include_usage': True}
-
-         if input_item.get('tools', None):
-             tools_copy = copy.deepcopy(input_item.get('tools'))
-             # Remove the "responses" from the functions, as that doesn't
-             # need to go to the model
-             for tool in tools_copy:
-                 if 'function' in tool and 'response' in tool['function']:
-                     del tool['function']['response']
-             request_json['tools'] = tools_copy
-
-         logger.debug(f'Request to remote API: {request_json}')
-
-         return request_json
-
-     def _parse_extra_params(self, request_json):
-         api_params = {}
-         extra_body = {}
-         for key, value in request_json.items():
-             if key in self.supported_params:
-                 api_params[key] = value
-             else:
-                 extra_body[key] = value
-
-         if extra_body:
-             api_params['extra_body'] = extra_body
-         return api_params
-
-     def send_request(self, request_json: dict) -> dict:
-         try:
-             parsed_request = self._parse_extra_params(request_json)
-             response = self.client.chat.completions.create(**parsed_request)
-
-             if response and self.stream:
-                 response = self._collect_stream_response(response)
-
-             return response.model_dump(exclude_unset=True)
-         except Exception as e:
-             logger.error(f'Error when calling remote API: {str(e)}')
-             raise e
-
-     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
-         collected_chunks = []
-         collected_messages = defaultdict(list)
-         collected_reasoning = defaultdict(list)
-         collected_tool_calls = defaultdict(dict)
-
-         for chunk in response_stream:
-             collected_chunks.append(chunk)
-             for choice in chunk.choices:
-                 # Handle reasoning content
-                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
-                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
-
-                 # Handle regular content
-                 if choice.delta.content is not None:
-                     collected_messages[choice.index].append(choice.delta.content)
-
-                 # Handle tool calls
-                 if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
-                     for tool_call in choice.delta.tool_calls:
-                         tool_id = tool_call.index
-
-                         # Initialize tool call if not present
-                         if tool_id not in collected_tool_calls[choice.index]:
-                             collected_tool_calls[choice.index][tool_id] = {
-                                 'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
-                                 'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
-                                 'function': {
-                                     'name': '',
-                                     'arguments': ''
-                                 }
-                             }
-
-                         # Update tool call with new chunks
-                         if hasattr(tool_call, 'function'):
-                             if hasattr(tool_call.function, 'name') and tool_call.function.name:
-                                 collected_tool_calls[
-                                     choice.index][tool_id]['function']['name'] = tool_call.function.name
-
-                             if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
-                                 collected_tool_calls[
-                                     choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
-
-                         # Update ID if it was received later
-                         if hasattr(tool_call, 'id') and tool_call.id:
-                             collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
-
-         # Get all unique choice indices from all collections
-         all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
-             collected_tool_calls.keys())
-
-         choices = []
-         for index in all_indices:
-             full_reply_content = ''.join(collected_messages.get(index, []))
-             reasoning = ''.join(collected_reasoning.get(index, []))
-
-             # Process tool_calls for this choice if any exists
-             tool_calls_list = None
-             if index in collected_tool_calls and collected_tool_calls[index]:
-                 tool_calls_list = list(collected_tool_calls[index].values())
-                 # Filter out any tool calls with None id (incomplete tool calls)
-                 tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
-
-             # use the finish_reason from the last chunk that generated this choice
-             finish_reason = None
-             for chunk in reversed(collected_chunks):
-                 if chunk.choices and chunk.choices[0].index == index:
-                     finish_reason = chunk.choices[0].finish_reason
-                     break
-
-             message_kwargs = {'role': 'assistant', 'content': full_reply_content}
-
-             if reasoning:
-                 message_kwargs['reasoning_content'] = reasoning
-
-             if tool_calls_list:
-                 message_kwargs['tool_calls'] = tool_calls_list
-
-             choice = Choice(
-                 finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
-             choices.append(choice)
-
-         # build the final completion object
-         return ChatCompletion(
-             id=collected_chunks[0].id,
-             choices=choices,
-             created=collected_chunks[0].created,
-             model=collected_chunks[0].model,
-             object='chat.completion',
-             usage=collected_chunks[-1].usage  # use the usage from the last chunk
-         )
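
For orientation, the removed ServerModelAdapter was the 0.17.x entry point for querying OpenAI-compatible endpoints. A minimal usage sketch against that old interface is shown below; the import path, endpoint URL, model name, and prompt are placeholders assumed for illustration rather than values taken from this diff:

```python
# Hypothetical 0.17.x usage of the removed ServerModelAdapter.
from evalscope.models.adapters.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://localhost:8000/v1/chat/completions',  # trailing '/chat/completions' is stripped in __init__
    model_id='my-served-model',
    api_key='EMPTY',
    stream=False,
)

# Each input dict carries either prebuilt 'messages' or raw 'data' plus an optional 'system_prompt'.
results = adapter.predict(
    inputs=[{'data': ['What is 2 + 2?'], 'system_prompt': 'You are a helpful assistant.'}],
    infer_cfg={'temperature': 0.0, 'max_tokens': 64},
)
print(results[0]['choices'][0]['message']['content'])
```
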
evalscope/models/adapters/t2i_adapter.py (deleted)
@@ -1,79 +0,0 @@
- import os
- import time
- import torch
- from typing import Any, Dict, List, Optional, Tuple, Union
-
- from evalscope.constants import OutputType
- from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
- from evalscope.utils.io_utils import OutputsStructure
- from evalscope.utils.logger import get_logger
- from ..local_model import LocalModel
- from ..register import register_model_adapter
- from .base_adapter import BaseModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name=OutputType.IMAGE_GENERATION)
- class T2IModelAdapter(BaseModelAdapter):
-     """
-     Text to image model adapter.
-     """
-
-     def __init__(self, model: LocalModel, **kwargs):
-         super().__init__(model)
-
-         self.task_config = kwargs.get('task_cfg', None)
-         assert self.task_config is not None, 'Task config is required for T2I model adapter.'
-
-         self.save_path = os.path.join(self.task_config.work_dir, OutputsStructure.PREDICTIONS_DIR,
-                                       self.task_config.model_id, 'images')
-         os.makedirs(self.save_path, exist_ok=True)
-
-     def _model_generate(self, prompt, infer_cfg=None) -> List:
-         """
-         Generate images from the model.
-         Args:
-             prompt: The input prompt.
-             infer_cfg: The inference configuration.
-         Returns:
-             The generated images.
-         """
-         infer_cfg = infer_cfg or {}
-
-         sample = self.model(prompt=prompt, **infer_cfg).images
-         return sample
-
-     @torch.no_grad()
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Args:
-             inputs: The input data.
-             infer_cfg: The inference configuration.
-         Returns:
-             The prediction results.
-         """
-         results = []
-         for input_item in inputs:
-             prompt = input_item['data'][0]
-             image_id = input_item.get('id') or input_item.get('index')
-
-             samples = self._model_generate(prompt, infer_cfg)
-
-             choices_list = []
-             for index, sample in enumerate(samples):
-                 image_file_path = os.path.join(self.save_path, f'{image_id}_{index}.jpeg')
-                 sample.save(image_file_path)
-                 logger.debug(f'Saved image to {image_file_path}')
-
-                 choice = ChatCompletionResponseChoice(
-                     index=index, message=ChatMessage(content=image_file_path, role='assistant'), finish_reason='stop')
-                 choices_list.append(choice)
-
-             res_d = ChatCompletionResponse(
-                 model=self.model_id, choices=choices_list, object='images.generations',
-                 created=int(time.time())).model_dump(exclude_unset=True)
-
-             results.append(res_d)
-
-         return results
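
The removed T2IModelAdapter read the prompt from data[0], named saved images after the item's id (or index), and wrapped the resulting file paths in an OpenAI-style completion. A rough sketch of the input and output shapes, with illustrative values only and the output directory shown only approximately:

```python
# Illustrative shapes for the removed T2IModelAdapter; values are made up, not taken from the diff.
input_item = {
    'id': 'sample-0',  # used to name the saved image files
    'data': ['A watercolor painting of a lighthouse at dusk'],
}

# adapter.predict([input_item]) returned one dict per input, roughly of this form:
expected_shape = {
    'model': '<model_id>',
    'object': 'images.generations',
    'choices': [{
        'index': 0,
        'finish_reason': 'stop',
        'message': {
            'role': 'assistant',
            # content is the saved image path under <work_dir>/<predictions_dir>/<model_id>/images/
            'content': '<work_dir>/predictions/<model_id>/images/sample-0_0.jpeg',
        },
    }],
}
```
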
evalscope/models/adapters/tau_bench_adapter.py (deleted)
@@ -1,189 +0,0 @@
- import json
- import time
- from typing import Any, Dict, List, Optional, Union
-
- from evalscope.utils.logger import get_logger
- from ..register import register_model_adapter
- from .server_adapter import ServerModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name='tau_bench_server')
- class TauBenchAdapter(ServerModelAdapter):
-     """
-     TauBench model adapter to request remote API model and generate results for TauBench evaluation.
-     Support multi-turn and single-turn function calling tasks.
-     """
-
-     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-         """
-         Args:
-             api_url: The URL of the remote API model.
-             model_id: The ID of the remote API model.
-             api_key: The API key of the remote API model.
-         """
-         super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
-
-         self._patch_agent_solve()
-
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
-         where each list is a follow up turn in the conversation
-         each turn is a List[List[Message]]
-
-         Args:
-             inputs (List[dict]): The input data.
-             infer_cfg (dict): Inference configuration.
-
-         Returns:
-             res (List[dict]): The model prediction results.
-         """
-         infer_cfg = infer_cfg or {}
-         results = []
-
-         for input_item in inputs:
-             raw_input = input_item.get('raw_input')
-
-             res_d = self.solve(env_name=raw_input['env_name'], task_index=raw_input['task_index'], infer_cfg=infer_cfg)
-
-             wrapper_res = {
-                 'choices': [{
-                     'index': 0,
-                     'message': {
-                         'content': json.dumps(res_d, ensure_ascii=False),
-                         'role': 'assistant'
-                     }
-                 }],
-                 'created':
-                 time.time(),
-                 'model':
-                 self.model_id,
-                 'object':
-                 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 0,
-                     'prompt_tokens': 0,
-                     'total_tokens': 0
-                 }
-             }
-
-             results.append(wrapper_res)
-
-         return results
-
-     def _patch_agent_solve(self):
-         """Patch ToolCallingAgent.solve method to use custom model configuration"""
-         from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
-         from tau_bench.envs.base import Env
-         from tau_bench.types import RESPOND_ACTION_NAME, SolveResult
-         from typing import List, Optional
-
-         def patched_solve(self,
-                           env: Env,
-                           task_index: Optional[int] = None,
-                           max_num_steps: int = 30,
-                           infer_cfg: Optional[dict] = {}) -> SolveResult:
-             env_reset_res = env.reset(task_index=task_index)
-             obs = env_reset_res.observation
-             info = env_reset_res.info.model_dump()
-             reward = 0.0
-             messages: List[Dict[str, Any]] = [
-                 {
-                     'role': 'system',
-                     'content': self.wiki
-                 },
-                 {
-                     'role': 'user',
-                     'content': obs
-                 },
-             ]
-
-             for step_index in range(max_num_steps):
-                 # Use adapter's model configuration instead of agent's
-                 request_json = adapter_instance.make_request(
-                     input_item={
-                         'messages': messages,
-                         'tools': self.tools_info
-                     }, infer_cfg=infer_cfg)
-                 res = adapter_instance.send_request(request_json)
-
-                 next_message = res['choices'][0]['message']
-                 action = message_to_action(next_message)
-                 env_response = env.step(action)
-                 reward = env_response.reward
-                 info = {**info, **env_response.info.model_dump()}
-
-                 if action.name != RESPOND_ACTION_NAME:
-                     next_message['tool_calls'] = next_message['tool_calls'][:1]
-                     messages.extend([
-                         next_message,
-                         {
-                             'role': 'tool',
-                             'tool_call_id': next_message['tool_calls'][0]['id'],
-                             'name': next_message['tool_calls'][0]['function']['name'],
-                             'content': env_response.observation,
-                         },
-                     ])
-                 else:
-                     messages.extend([
-                         next_message,
-                         {
-                             'role': 'user',
-                             'content': env_response.observation
-                         },
-                     ])
-                 logger.debug(f'Task: {task_index} Step: {step_index} finished')
-
-                 if env_response.done:
-                     break
-
-             return SolveResult(
-                 reward=reward,
-                 info=info,
-                 messages=messages,
-                 total_cost=0,
-             )
-
-         adapter_instance = self
-
-         ToolCallingAgent.solve = patched_solve
-
-         return 'ToolCallingAgent.solve patched successfully'
-
-     def solve(self, env_name, task_index, infer_cfg, **kwargs):
-         """
-         Solve a specific task in the TauBench environment.
-
-         Args:
-             env_name (str): The name of the TauBench environment.
-             task_index (int): The index of the task to solve.
-             **kwargs: Additional arguments for the task.
-
-         Returns:
-             dict: The result of the task.
-         """
-         from tau_bench.agents.tool_calling_agent import ToolCallingAgent
-         from tau_bench.envs import get_env
-
-         # This method can be implemented to solve specific tasks in the TauBench environment
-         isolated_env = get_env(
-             env_name=env_name,
-             user_strategy='llm',
-             user_model='dummy',  # Use dummy model to prevent errors
-             user_provider='openai',  # Use dummy provider to prevent errors
-             task_split='test',
-             task_index=task_index,
-         )
-         agent = ToolCallingAgent(
-             tools_info=isolated_env.tools_info,
-             wiki=isolated_env.wiki,
-             model='dummy',  # Use dummy model to prevent errors
-             provider='dummy',  # Use dummy provider to prevent errors
-             temperature=0,  # dummy temperature to prevent errors
-         )
-
-         res = agent.solve(env=isolated_env, task_index=task_index, infer_cfg=infer_cfg)
-
-         return res.model_dump()
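
The removed TauBenchAdapter ran a full tau-bench episode per sample, monkey-patching ToolCallingAgent.solve so that every model call went through the adapter's OpenAI-compatible endpoint instead of tau-bench's own LLM client. A hedged invocation sketch, assuming tau-bench is installed; the import path, endpoint, model name, and environment name are placeholders:

```python
# Illustrative only: drive a single tau-bench task through the removed adapter.
from evalscope.models.adapters.tau_bench_adapter import TauBenchAdapter

adapter = TauBenchAdapter(
    api_url='http://localhost:8000/v1',  # placeholder OpenAI-compatible endpoint
    model_id='my-served-model',          # placeholder model name
)

# 'retail' is assumed to be an available tau-bench environment; task_index selects a test task.
result = adapter.solve(env_name='retail', task_index=0, infer_cfg={'temperature': 0.0})
print(result['reward'], len(result['messages']))
```
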
evalscope/models/custom/__init__.py (deleted)
@@ -1,4 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from .custom_model import CustomModel
- from .dummy_model import DummyCustomModel
evalscope/models/custom/custom_model.py (deleted)
@@ -1,50 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import torch
- from abc import ABC, abstractmethod
- from typing import Any, Dict, List, Union
-
-
- class CustomModel(ABC):
-
-     def __init__(self, config: dict, **kwargs):
-         self.config = config
-         self.kwargs = kwargs
-
-     @abstractmethod
-     @torch.no_grad()
-     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
-         """
-         Model prediction function for batch inputs.
-
-         Args:
-             prompts (str): The input batch of prompts to predict.
-
-             **kwargs: kwargs
-
-         Returns:
-             res (dict): The model prediction results (batch). Format:
-             [
-                 {
-                     'choices': [
-                         {
-                             'index': 0,
-                             'message': {
-                                 'content': 'xxx',
-                                 'role': 'assistant'
-                             }
-                         }
-                     ],
-                     'created': 1677664795,
-                     'model': 'gpt-3.5-turbo-0613',  # should be model_id
-                     'object': 'chat.completion',
-                     'usage': {
-                         'completion_tokens': 17,
-                         'prompt_tokens': 57,
-                         'total_tokens': 74
-                     }
-                 }
-                 ,
-                 ...
-             ]
-         """
-         raise NotImplementedError
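
The docstring above fixes the 0.17.x contract for custom models: predict receives a batch of prompts and must return OpenAI-style completion dicts. A minimal sketch of a conforming subclass follows; the class name and echo behaviour are illustrative and not part of the package (the DummyCustomModel removed below was the in-repo reference implementation):

```python
# Minimal sketch of a 0.17.x-style custom model; the echo behaviour is illustrative.
import time
from typing import Any, Dict, List

from evalscope.models import CustomModel


class EchoModel(CustomModel):

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        responses = []
        for prompt in prompts:
            responses.append({
                'choices': [{
                    'index': 0,
                    'message': {'content': f'Echo: {prompt}', 'role': 'assistant'},
                }],
                'created': int(time.time()),
                'model': self.config.get('model_id', 'echo-model'),
                'object': 'chat.completion',
                'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
            })
        return responses


# Usage: EchoModel(config={'model_id': 'echo-model'}) can then be passed as `model` in a TaskConfig
# with eval_type='custom', as in the DummyCustomModel example below.
```
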
evalscope/models/custom/dummy_model.py (deleted)
@@ -1,99 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import time
- from typing import List
-
- from evalscope.models import CustomModel
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class DummyCustomModel(CustomModel):
-
-     def __init__(self, config: dict = {}, **kwargs):
-         super(DummyCustomModel, self).__init__(config=config, **kwargs)
-
-     def make_request_messages(self, input_item: dict) -> list:
-         """
-         Make request messages for OpenAI API.
-         """
-         if input_item.get('messages', None):
-             return input_item['messages']
-
-         data: list = input_item['data']
-         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-             query = '\n'.join(''.join(item) for item in data)
-             system_prompt = input_item.get('system_prompt', None)
-         else:
-             query = data[0]
-             system_prompt = input_item.get('system_prompt', None)
-
-         messages = []
-         if system_prompt:
-             messages.append({'role': 'system', 'content': system_prompt})
-
-         messages.append({'role': 'user', 'content': query})
-
-         return messages
-
-     def predict(self, prompts: List[dict], **kwargs):
-         original_inputs = kwargs.get('origin_inputs', None)
-         infer_cfg = kwargs.get('infer_cfg', None)
-
-         logger.debug(f'** Prompts: {prompts}')
-         if original_inputs is not None:
-             logger.debug(f'** Original inputs: {original_inputs}')
-         if infer_cfg is not None:
-             logger.debug(f'** Inference config: {infer_cfg}')
-
-         # Simulate a response based on the prompts
-         # Must return a list of dicts with the same format as the OpenAI API.
-         responses = []
-         for input_item in original_inputs:
-             # message = self.make_request_messages(input_item)
-             # response = f'Dummy response for prompt: {message}'
-
-             res_d = {
-                 'choices': [{
-                     'index': 0,
-                     'message': {
-                         'content': '*PlaceHolder*',
-                         'role': 'assistant'
-                     }
-                 }],
-                 'created': time.time(),
-                 'model': self.config.get('model_id'),
-                 'object': 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 0,
-                     'prompt_tokens': 0,
-                     'total_tokens': 0
-                 }
-             }
-
-             responses.append(res_d)
-
-         return responses
-
-
- if __name__ == '__main__':
-     from evalscope import TaskConfig, run_task
-
-     dummy_model = DummyCustomModel()
-     task_config = TaskConfig(
-         model=dummy_model,
-         model_id='evalscope-model-dummy',
-         datasets=['gsm8k'],
-         eval_type='custom',  # must be custom for custom model evaluation
-         generation_config={
-             'max_new_tokens': 100,
-             'temperature': 0.0,
-             'top_p': 1.0,
-             'top_k': 50,
-             'repetition_penalty': 1.0
-         },
-         debug=True,
-         limit=5,
-     )
-
-     eval_results = run_task(task_cfg=task_config)