evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/metrics/completion_parsers.py
@@ -1,227 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # flake8: noqa
-
- import ast
- import re
-
- # from . import utils as ann_utils
- from evalscope.constants import ArenaWinner
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
- one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
- one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
-
-
- # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
- # does not work with batched completions
- def lmsys_parser(completion, output_format):
-     if output_format == '[[rating]]':
-         match = re.search(one_score_pattern, completion)
-         if not match:
-             match = re.search(one_score_pattern_backup, completion)
-
-         if match:
-             rating = ast.literal_eval(match.groups()[0])
-         else:
-             logger.error(f'Content: {completion}\n'
-                          'You must manually fix the score.')
-             rating = -1
-
-         return rating
-     if output_format == '[[rating_a,rating_b]]':
-         try:
-             score_pair = completion.split('\n')[0]
-             score_pair = score_pair.replace(',', ' ')
-             sp = score_pair.split(' ')
-             if len(sp) == 2:
-                 score_1 = float(sp[0])
-                 score_2 = float(sp[1])
-                 if score_1 > score_2:
-                     winner = ArenaWinner.MODEL_A
-                 elif score_1 < score_2:
-                     winner = ArenaWinner.MODEL_B
-                 else:
-                     if score_1 == score_1 == -1:
-                         winner = ArenaWinner.UNKNOWN
-                     winner = ArenaWinner.TIE
-                 return winner, [score_1, score_2]
-             else:
-                 raise Exception('Invalid score pair.')
-         except Exception as e:
-             logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
-             return ArenaWinner.UNKNOWN, [-1, -1]
-     elif output_format == '[[A]]':
-         if '[[A]]' in completion:
-             winner = ArenaWinner.MODEL_A
-         elif '[[B]]' in completion:
-             winner = ArenaWinner.MODEL_B
-         elif '[[C]]' in completion:
-             winner = ArenaWinner.TIE
-         else:
-             logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
-             winner = ArenaWinner.UNKNOWN
-         return winner
-
-
- def ranking_parser(completion, **kwargs):
-     try:
-         if isinstance(completion, str):
-             ordered_completions = ast.literal_eval(completion)
-         else:
-             ordered_completions = completion
-
-         rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
-         assert rank in [1, 2]
-
-         return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
-     except Exception as e:
-         logger.error(f'{e}\nContent: {completion}\n'
-                      'You must manually fix the score pair.')
-         return ArenaWinner.UNKNOWN
-
-
- class ResponseParser:
-
-     @staticmethod
-     def parse_first_capital(text: str, options: list[str]) -> str:
-         for t in text:
-             if t.isupper() and (t in options):
-                 return t
-         return ''
-
-     @staticmethod
-     def parse_last_capital(text: str, options: list[str]) -> str:
-         for t in text[::-1]:
-             if t.isupper() and (t in options):
-                 return t
-         return ''
-
-     @staticmethod
-     def parse_first_option_with_choices(text: str, options: list[str]) -> str:
-         """
-         Find first valid option for text.
-
-         Args:
-             text: The text to parse.
-             options: The options to find. e.g. ['A', 'B', 'C', 'D']
-         """
-         options_concat = ResponseParser.process_options(options)
-
-         patterns = [
-             rf'答案是?\s?([{options_concat}])',
-             rf'答案是?\s?:([{options_concat}])',
-             rf'答案是?\s?:([{options_concat}])',
-             rf'答案应该?是\s?([{options_concat}])',
-             rf'答案应该?选\s?([{options_concat}])',
-             rf'答案为\s?([{options_concat}])',
-             rf'答案选\s?([{options_concat}])',
-             rf'选择?\s?([{options_concat}])',
-             rf'故选?\s?([{options_concat}])'
-             rf'只有选?项?\s?([{options_concat}])\s?是?对',
-             rf'只有选?项?\s?([{options_concat}])\s?是?错',
-             rf'只有选?项?\s?([{options_concat}])\s?不?正确',
-             rf'只有选?项?\s?([{options_concat}])\s?错误',
-             rf'说法不?对选?项?的?是\s?([{options_concat}])',
-             rf'说法不?正确选?项?的?是\s?([{options_concat}])',
-             rf'说法错误选?项?的?是\s?([{options_concat}])',
-             rf'([{options_concat}])\s?是正确的',
-             rf'([{options_concat}])\s?是正确答案',
-             rf'选项\s?([{options_concat}])\s?正确',
-             rf'所以答\s?([{options_concat}])',
-             rf'所以\s?([{options_concat}][.。$]?$)',
-             rf'所有\s?([{options_concat}][.。$]?$)',
-             rf'[\s,::,]([{options_concat}])[。,,\.]?$',
-             rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
-             rf'[\s,,::]因此([{options_concat}])[。\.]?$',
-             rf'[是为。]\s?([{options_concat}])[。\.]?$',
-             rf'因此\s?([{options_concat}])[。\.]?$',
-             rf'显然\s?([{options_concat}])[。\.]?$',
-             rf'答案是\s?(\S+)(?:。|$)',
-             rf'答案应该是\s?(\S+)(?:。|$)',
-             rf'答案为\s?(\S+)(?:。|$)',
-             rf'答案是(.*?)[{options_concat}]',
-             rf'答案为(.*?)[{options_concat}]',
-             rf'固选(.*?)[{options_concat}]',
-             rf'答案应该是(.*?)[{options_concat}]',
-             rf'[Tt]he answer is \(?[{options_concat}]\)?',
-             rf'[Tt]he correct answer is [{options_concat}]',
-             rf'[Tt]he correct answer is:\n[{options_concat}]',
-             rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-             rf'^选项\s?([{options_concat}])',
-             rf'^([{options_concat}])\s?选?项',
-             rf'(\s|^)[{options_concat}][\s。,,::\.$]',
-             rf'(\s|^)[{options_concat}](\s|$)',
-             rf'[{options_concat}]',
-         ]
-
-         regexes = [re.compile(pattern) for pattern in patterns]
-         for regex in regexes:
-             match = regex.search(text)
-             if match:
-                 outputs = match.group(0)
-                 for i in options:
-                     if i in outputs:
-                         return i
-         # If no match found, try to find the last capital letter in the text
-         last_capital = ResponseParser.parse_last_capital(text, options)
-         if last_capital:
-             return last_capital
-         return 'No valid option found'
-
-     @staticmethod
-     def parse_first_option(text: str, options: list[str]) -> str:
-         """
-         Find first valid option for text.
-
-         Args:
-             text: The text to parse.
-         """
-         options_pattern = ResponseParser.process_options(options)
-
-         patterns = [
-             rf'[Aa]nswer:\s*({options_pattern})',
-             rf'ANSWER:\s*({options_pattern})',
-             rf'answer is \(?({options_pattern})\)?',
-             rf'[Tt]he correct answer is:\s*({options_pattern})',
-             rf'[Tt]he correct answer is:\n\s*({options_pattern})',
-             rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
-             rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
-             rf'[Tt]he answer is \s*({options_pattern})',
-         ]
-
-         regexes = [re.compile(pattern) for pattern in patterns]
-         for regex in regexes:
-             matches = regex.search(text)
-             if matches:
-                 return matches.group(1)
-         # If no match found, try to find the last capital letter in the text
-         last_capital = ResponseParser.parse_last_capital(text, options)
-         if last_capital:
-             return last_capital
-         return 'No valid option found'
-
-     @staticmethod
-     def parse_bracketed_answer(text: str, options: list[str]) -> str:
-         options = ResponseParser.process_options(options)
-         # Match the first occurrence of the options in angle brackets
-         match = re.search(rf'<({options})>', text)
-         if match:
-             return match.group(1)
-         return 'No valid option found'
-
-     @staticmethod
-     def process_options(options: list[str]) -> str:
-         # Escape each option to ensure special characters in options are treated literally
-         escaped_options = [re.escape(option) for option in options]
-         # Join options into a regex pattern separated by '|', to match any of the options
-         options_pattern = '|'.join(escaped_options)
-         return options_pattern
-
-
- if __name__ == '__main__':
-     result = '**Answer: A **Answer: C**'
-     options = ['A', 'B', 'C', 'D']
-     parsed_result = ResponseParser.parse_first_option(result, options)
-     print(f'Parsed result: {parsed_result}')  # Should print 'C'
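
For reference, a minimal sketch of how the removed parsers were typically called in 0.17.x; the sample strings below are hypothetical and only illustrate the regexes shown above:

from evalscope.metrics.completion_parsers import ResponseParser, lmsys_parser

# Hypothetical judge reply in the '[[rating]]' format; one_score_pattern extracts 8.5.
rating = lmsys_parser('Overall quality: [[8.5]]', '[[rating]]')

# Hypothetical model answer; the "answer is \(?...\)?" pattern in parse_first_option yields 'B'.
choice = ResponseParser.parse_first_option('The answer is (B).', ['A', 'B', 'C', 'D'])

In 1.0.x this module is removed; answer extraction moves to the new filters and multi_choices utilities (see evalscope/filters/extraction.py and evalscope/utils/multi_choices.py above).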
evalscope/metrics/named_metrics.py
@@ -1,55 +0,0 @@
- from dataclasses import dataclass, field
- from functools import partial
- from typing import Callable, Dict
-
- from evalscope.metrics.metrics import mean, pass_at_k, weighted_mean
- from evalscope.metrics.t2v_metrics import (blip2_score, clip_flant5_score, clip_score, fga_blip2_score, hpsv2_1_score,
-                                            hpsv2_score, image_reward_score, mps_score, pick_score)
-
-
- @dataclass
- class Metric:
-     name: str = 'default_metric'
-     object: Callable = field(default_factory=lambda: mean)
-
-
- class MetricRegistry:
-
-     def __init__(self):
-         self.metrics: Dict[str, Metric] = {}
-
-     def register(self, metric: Metric):
-         self.metrics[metric.name] = metric
-
-     def get(self, name: str) -> Metric:
-         try:
-             return self.metrics[name]
-         except KeyError:
-             raise KeyError(f'Metric {name} not found in the registry. Available metrics: {self.list_metrics()}')
-
-     def list_metrics(self):
-         return list(self.metrics.keys())
-
-
- metric_registry = MetricRegistry()
-
- # Register metrics
- metric_registry.register(Metric(name='AverageAccuracy', object=mean))
- metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
- metric_registry.register(Metric(name='AverageBLEU', object=mean))
- metric_registry.register(Metric(name='AverageRouge', object=mean))
- metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
- metric_registry.register(Metric(name='AveragePass@1', object=mean))
- for k in range(1, 17):
-     metric_registry.register(Metric(name=f'Pass@{k}', object=partial(pass_at_k, k=k)))
-
- # t2v_metrics
- metric_registry.register(Metric(name='VQAScore', object=clip_flant5_score))
- metric_registry.register(Metric(name='PickScore', object=pick_score))
- metric_registry.register(Metric(name='CLIPScore', object=clip_score))
- metric_registry.register(Metric(name='BLIPv2Score', object=blip2_score))
- metric_registry.register(Metric(name='HPSv2Score', object=hpsv2_score))
- metric_registry.register(Metric(name='HPSv2.1Score', object=hpsv2_1_score))
- metric_registry.register(Metric(name='ImageRewardScore', object=image_reward_score))
- metric_registry.register(Metric(name='FGA_BLIP2Score', object=fga_blip2_score))
- metric_registry.register(Metric(name='MPS', object=mps_score))
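
A minimal usage sketch for the removed registry, assuming mean averages a list of floats (the score values below are made up):

from evalscope.metrics.named_metrics import metric_registry

acc = metric_registry.get('AverageAccuracy')   # Metric(name='AverageAccuracy', object=mean)
print(acc.object([1.0, 0.0, 1.0]))             # average of the per-sample scores
print(metric_registry.list_metrics())          # all registered metric names; unknown names raise KeyError

In 1.0.x this registration logic is folded into evalscope/metrics/metric.py and the new evalscope/api/registry.py.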
evalscope/models/adapters/__init__.py
@@ -1,14 +0,0 @@
- from .base_adapter import BaseModelAdapter, initialize_model_adapter
- from .bfcl_adapter import BFCLAdapter
- from .chat_adapter import ChatGenerationModelAdapter
- from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
- from .custom_adapter import CustomModelAdapter
- from .server_adapter import ServerModelAdapter
- from .t2i_adapter import T2IModelAdapter
- from .tau_bench_adapter import TauBenchAdapter
-
- __all__ = [
-     'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
-     'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
-     'TauBenchAdapter'
- ]
evalscope/models/adapters/base_adapter.py
@@ -1,84 +0,0 @@
- import torch
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Any, List, Optional, Union
-
- from evalscope.constants import EvalType, OutputType
- from evalscope.utils.logger import get_logger
- from ..custom import CustomModel
- from ..local_model import LocalModel
-
- logger = get_logger()
-
- if TYPE_CHECKING:
-     from evalscope.benchmarks import DataAdapter
-     from evalscope.config import TaskConfig
-
-
- class BaseModelAdapter(ABC):
-
-     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
-         if model is None:
-             self.model_cfg = kwargs.get('model_cfg', None)
-         elif isinstance(model, LocalModel):
-             self.model = model.model
-             self.model_id = model.model_id
-             self.model_revision = model.model_revision
-             self.device = model.device
-             self.tokenizer = model.tokenizer
-             self.model_cfg = model.model_cfg
-         elif isinstance(model, CustomModel):
-             self.model_cfg = model.config
-         else:
-             raise ValueError(f'Unsupported model type: {type(model)}')
-
-     @abstractmethod
-     @torch.no_grad()
-     def predict(self, *args, **kwargs) -> Any:
-         raise NotImplementedError
-
-
- def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', base_model: 'LocalModel'):
-     """Initialize the model adapter based on the task configuration."""
-     if task_cfg.eval_type == EvalType.CUSTOM:
-         if not isinstance(task_cfg.model, CustomModel):
-             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-         from evalscope.models import CustomModelAdapter
-         return CustomModelAdapter(custom_model=task_cfg.model)
-     else:
-         from ..register import get_model_adapter
-
-         # we need to determine the model adapter class based on the output type
-         model_adapter_cls_str = benchmark.model_adapter
-
-         if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-
-             if 'server' not in model_adapter_cls_str:
-                 logger.warning(f'Output type {model_adapter_cls_str} is not supported for service evaluation. '
-                                f'Using server model adapter instead.')
-                 model_adapter_cls_str = 'server'
-                 benchmark.model_adapter = model_adapter_cls_str
-
-             # init server model adapter
-             model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-
-             return model_adapter_cls(
-                 api_url=task_cfg.api_url,
-                 model_id=task_cfg.model,
-                 api_key=task_cfg.api_key,
-                 seed=task_cfg.seed,
-                 timeout=task_cfg.timeout,
-                 stream=task_cfg.stream,
-             )
-         else:
-             if model_adapter_cls_str not in benchmark.output_types:
-                 logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
-                                f'Using {benchmark.output_types[0]} instead.')
-                 model_adapter_cls_str = benchmark.output_types[0]
-                 benchmark.model_adapter = model_adapter_cls_str
-
-             model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-             return model_adapter_cls(
-                 model=base_model,
-                 generation_config=task_cfg.generation_config,
-                 chat_template=task_cfg.chat_template,
-                 task_cfg=task_cfg)
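
A minimal sketch of the old extension point: in 0.17.x a custom adapter only had to subclass the ABC above and implement predict. The EchoModelAdapter below is hypothetical and exists purely to illustrate the removed interface:

from typing import Any, List, Optional

from evalscope.models.adapters import BaseModelAdapter

class EchoModelAdapter(BaseModelAdapter):
    """Hypothetical adapter that returns its inputs unchanged instead of querying a model."""

    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> Any:
        return inputs

# With model=None, BaseModelAdapter.__init__ only records the model_cfg kwarg:
# adapter = EchoModelAdapter(model=None, model_cfg={'model_id': 'echo'})

In 1.0.x this role is taken over by the model APIs in evalscope/models/model_apis.py and evalscope/api/model/.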
evalscope/models/adapters/bfcl_adapter.py
@@ -1,246 +0,0 @@
- import json
- import time
- import uuid
- from typing import Any, List, Optional, Union
-
- from evalscope.utils.logger import get_logger
- from ..register import register_model_adapter
- from .server_adapter import ServerModelAdapter
-
- logger = get_logger()
-
-
- @register_model_adapter(name='bfcl_server')
- class BFCLAdapter(ServerModelAdapter):
-     """
-     BFCL model adapter to request remote API model and generate results for BFCL evaluation.
-     Support multi-turn and single-turn function calling tasks.
-     """
-
-     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-         """
-         Args:
-             api_url: The URL of the remote API model.
-             model_id: The ID of the remote API model.
-             api_key: The API key of the remote API model.
-         """
-         super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
-
-     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-         """
-         Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
-         where each list is a follow up turn in the conversation
-         each turn is a List[List[Message]]
-
-         Args:
-             inputs (List[dict]): The input data.
-             infer_cfg (dict): Inference configuration.
-
-         Returns:
-             res (List[dict]): The model prediction results.
-         """
-         infer_cfg = infer_cfg or {}
-         results = []
-
-         for input_item in inputs:
-             # This flag decides if we pass tools to the API or try tool calling via prompting
-             # Passing tools to the API means that we rely on the API to manage system prompt specifics
-             # and also expect parsed tool calls in the ChatCompletionMessage object
-             # This is how the is_fc_model=True benchmark is designed to work
-             # On the other hand, we try to manage
-             # tool calling via prompting and parse tool calls in the standard text response
-             # This is how the is_fc_model=False benchmark is designed to work
-             row = input_item.get('messages')
-             is_fc_model = row.get('is_fc_model', False)
-
-             if is_fc_model:
-                 response = self.generate_turn_with_tools(row, infer_cfg)
-             else:
-                 response = self.generate_turn(row, infer_cfg)
-
-             # wrap response with openai types
-             res_d = {
-                 'choices': [{
-                     'index': 0,
-                     'message': {
-                         'content': response,
-                         'role': 'assistant'
-                     }
-                 }],
-                 'created': time.time(),
-                 'model': self.model_id,
-                 'object': 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 0,
-                     'prompt_tokens': 0,
-                     'total_tokens': 0
-                 }
-             }
-             results.append(res_d)
-
-         return results
-
-     def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
-         from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
-                                                          MAXIMUM_STEP_LIMIT)
-         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
-         from bfcl_eval.model_handler.utils import default_decode_execute_prompting
-
-         all_model_responses = []
-         current_messages = []
-         turns = row['turns']
-         for turn_idx, messages in enumerate(turns):
-             n_steps = 0
-             current_responses = []
-             current_messages += messages.copy()
-
-             if str(turn_idx) in row['missing_functions']:
-                 assert len(messages) == 0, 'Holdout turn should not have user message.'
-                 new_turn = [{
-                     'role':
-                     'user',
-                     'content':
-                     DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
-                         functions=row['missing_functions'][str(turn_idx)]),
-                 }]
-                 current_messages += new_turn
-
-             while True:
-                 input_item = {
-                     'messages': current_messages,
-                 }
-                 responses = self.process_single_input(input_item, infer_cfg)
-                 result = responses['choices'][0]['message']['content']
-
-                 logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
-                 current_messages.append({
-                     'role': 'assistant',
-                     'content': result,
-                 })
-                 current_responses.append(result)
-
-                 execute_tools = row.get('should_execute_tool_calls', False)
-                 if execute_tools:
-                     try:
-                         tool_calls = default_decode_execute_prompting(result)
-                     except Exception:
-                         tool_calls = None
-
-                     if tool_calls is None:
-                         break
-
-                     tool_outputs, _ = execute_multi_turn_func_call(
-                         tool_calls,
-                         initial_config=row['initial_config'],
-                         involved_classes=row['involved_classes'],
-                         model_name='evaluator_loop',
-                         test_entry_id=row['id'],
-                         long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
-                         is_evaL_run=False,
-                     )
-                     # Append tool outputs to the current messages
-                     tool_results = []
-                     for tool_output, tool_call in zip(tool_outputs, tool_calls):
-                         tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
-                     current_messages.append({
-                         'role': 'user',
-                         'content': repr(tool_results),
-                     })
-                 else:
-                     break
-
-                 n_steps += 1
-                 if n_steps > MAXIMUM_STEP_LIMIT:
-                     logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
-                     break
-
-             all_model_responses.append(current_responses)
-
-         return all_model_responses
-
-     def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
-         from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
-                                                          MAXIMUM_STEP_LIMIT)
-         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
-         from bfcl_eval.model_handler.utils import convert_to_function_call
-
-         all_model_responses = []
-         current_messages = []
-         turns = row['turns']
-         for turn_idx, messages in enumerate(turns):
-             n_steps = 0
-             current_responses = []
-             current_messages += messages.copy()
-             tools = row['tools']
-
-             if str(turn_idx) in row['missing_functions']:
-                 assert len(messages) == 0, 'Holdout turn should not have user message.'
-                 # inject new functions on the fly
-                 new_tools = row['missing_functions'][str(turn_idx)]
-                 for new_tool in new_tools:
-                     tools.append({
-                         'type': 'function',
-                         'function': new_tool[0],
-                     })
-                 new_turn = [{
-                     'role': 'user',
-                     'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
-                 }]
-                 current_messages += new_turn
-
-             while True:
-                 input_item = {
-                     'messages': current_messages,
-                     'tools': tools,
-                 }
-                 responses = self.process_single_input(input_item, infer_cfg)
-                 message = responses['choices'][0]['message']
-
-                 current_messages.append(message)
-                 if isinstance(message, str):
-                     model_responses = [message]
-                     tool_call_strs = None
-                 elif message.get('tool_calls'):
-                     model_responses = [{
-                         tc['function']['name']: tc['function']['arguments']
-                     } for tc in message['tool_calls']]
-                     try:
-                         tool_call_strs = convert_to_function_call(model_responses)
-                     except Exception as e:
-                         logger.error(f'Error converting tool calls to function call strings: {e}')
-                         tool_call_strs = None
-                 else:
-                     model_responses = [message['content']]
-                     tool_call_strs = None
-
-                 current_responses.extend(model_responses)
-
-                 execute_tools = row.get('should_execute_tool_calls', False)
-                 if execute_tools and tool_call_strs is not None:
-                     tool_outputs, _ = execute_multi_turn_func_call(
-                         tool_call_strs,
-                         initial_config=row['initial_config'],
-                         involved_classes=row['involved_classes'],
-                         model_name='evaluator_loop',
-                         test_entry_id=row['id'],
-                         long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
-                         is_evaL_run=False,
-                     )
-
-                     for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
-                         current_messages.append({
-                             'role': 'tool',
-                             'tool_call_id': tc['id'],
-                             'content': json.dumps({'response': tool_output}),
-                         })
-                 else:
-                     break
-
-                 n_steps += 1
-                 if n_steps > MAXIMUM_STEP_LIMIT:
-                     logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
-                     break
-
-             all_model_responses.append(current_responses)
-
-         return all_model_responses