evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py}
@@ -1,4 +1,5 @@
- Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
+ # flake8: noqa
+ FEW_SHOT_SAMPLES = """Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'ANSWER: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.

  Question:
  A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is
@@ -13,8 +14,8 @@ H) 4
  I) 5
  J) 20

- Answer: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4.
- Answer: H.
+ ANSWER: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4.
+ ANSWER: H.

  Question:
  Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?
@@ -29,11 +30,11 @@ H) 100 times more
  I) 10 times more
  J) N/A

- Answer: Let's think step by step. The amount of light a telescope can gather compared to the human eye is proportional to the area of its apertures. The area of a circle is given by the formula $A = \pi \left(\frac{{D}}{{2}}\right)^2$, where $D$ is the diameter. Therefore, the relative light-gathering power is calculated as:
+ ANSWER: Let's think step by step. The amount of light a telescope can gather compared to the human eye is proportional to the area of its apertures. The area of a circle is given by the formula $A = \pi \left(\frac{{D}}{{2}}\right)^2$, where $D$ is the diameter. Therefore, the relative light-gathering power is calculated as:
  \[
  \frac{{\left(\frac{{50 \text{{ cm}}}}{{2}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{2}}\right)^2}} = \frac{{\left(\frac{{50 \text{{ cm}}}}{{0.1 \text{{ cm}}}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{0.1 \text{{ cm}}}}\right)^2}} = \frac{{500^2}}{{5^2}} = 10000.
  \]
- Answer: E.
+ ANSWER: E.

  Question:
  Where do most short-period comets come from and how do we know?
@@ -44,8 +45,9 @@ D) The Oort cloud; short period comets have orbital periods similar to asteroids
  E) The Oort Cloud; short period comets tend to come from random directions indicating a spherical distribution of comets called the Oort Cloud.
  F) The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud.
  G) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt.
- Answer: Let's think step by step. Most short-period comets originate from the Kuiper belt. This is deduced from the observation that these comets tend to follow orbits that lie in the plane of the solar system, similar to the distribution of objects in the Kuiper belt itself. Thus, the alignment of these cometary orbits with the ecliptic plane points to their Kuiper belt origin.
- Answer: A.
+
+ ANSWER: Let's think step by step. Most short-period comets originate from the Kuiper belt. This is deduced from the observation that these comets tend to follow orbits that lie in the plane of the solar system, similar to the distribution of objects in the Kuiper belt itself. Thus, the alignment of these cometary orbits with the ecliptic plane points to their Kuiper belt origin.
+ ANSWER: A.

  Question:
  Colors in a soap bubble result from light
@@ -60,8 +62,8 @@ H) absorption
  I) diffraction
  J) transmission

- Answer: Let's think step by step. The colorful patterns observed in a soap bubble are caused by the phenomenon of light interference. This occurs when light waves bounce between the two surfaces of the soap film, combining constructively or destructively based on their phase differences and the varying thickness of the film. These interactions result in vibrant color patterns due to variations in the intensity of different wavelengths of light.
- Answer: E.
+ ANSWER: Let's think step by step. The colorful patterns observed in a soap bubble are caused by the phenomenon of light interference. This occurs when light waves bounce between the two surfaces of the soap film, combining constructively or destructively based on their phase differences and the varying thickness of the film. These interactions result in vibrant color patterns due to variations in the intensity of different wavelengths of light.
+ ANSWER: E.

  Question:
  A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?
@@ -76,15 +78,11 @@ H) 150 W
  I) 60 W
  J) 300 W

- Answer: Let's think step by step. The rate of energy usage, known as power, in an electrical circuit is calculated by the product of voltage and current. For a microwave oven connected to a 120 V outlet and drawing a current of 2 amps, the power consumption can be calculated as follows:
+ ANSWER: Let's think step by step. The rate of energy usage, known as power, in an electrical circuit is calculated by the product of voltage and current. For a microwave oven connected to a 120 V outlet and drawing a current of 2 amps, the power consumption can be calculated as follows:
  \[
  \text{{Power}} = \text{{Voltage}} \times \text{{Current}} = 120 \, \text{{V}} \times 2 \, \text{{A}} = 240 \, \text{{W}}.
  \]
  Therefore, the microwave oven uses energy at a rate of 240 watts.
- Answer: A.
-
- Question:
- {query}
- {choices}
+ ANSWER: A.

- Answer: Let's think step by step.
+ """
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py
@@ -1,34 +1,16 @@
- import os
- import random
- import re
+ # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.utils import logger
+ import os
+ from typing import Any, Dict

- current_dir = os.path.dirname(os.path.abspath(__file__))
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import FEW_SHOT_TEMPLATE, MultipleChoiceTemplate

- SUBSET_LIST = [
- 'Electronic Science and Technology', 'Philosophy', 'Traditional Chinese Medicine', 'Applied Economics',
- 'Mathematics', 'Physics', 'Clinical Medicine', 'Computer Science and Technology',
- 'Information and Communication Engineering', 'Control Science and Engineering', 'Theoretical Economics', 'Law',
- 'History', 'Basic Medicine', 'Education', 'Materials Science and Engineering', 'Electrical Engineering',
- 'Systems Science', 'Power Engineering and Engineering Thermophysics', 'Military Science', 'Biology',
- 'Business Administration', 'Language and Literature', 'Public Health and Preventive Medicine', 'Political Science',
- 'Chemistry', 'Hydraulic Engineering', 'Chemical Engineering and Technology', 'Pharmacy', 'Geography', 'Art Studies',
- 'Architecture', 'Forestry Engineering', 'Public Administration', 'Oceanography', 'Journalism and Communication',
- 'Nuclear Science and Technology', 'Weapon Science and Technology', 'Naval Architecture and Ocean Engineering',
- 'Environmental Science and Engineering', 'Transportation Engineering', 'Geology', 'Physical Oceanography',
- 'Musicology', 'Stomatology', 'Aquaculture', 'Mechanical Engineering',
- 'Aeronautical and Astronautical Science and Technology', 'Civil Engineering', 'Mechanics',
- 'Petroleum and Natural Gas Engineering', 'Sociology', 'Food Science and Engineering', 'Agricultural Engineering',
- 'Surveying and Mapping Science and Technology', 'Metallurgical Engineering',
- 'Library, Information and Archival Management', 'Mining Engineering', 'Astronomy',
- 'Geological Resources and Geological Engineering', 'Atmospheric Science', 'Optical Engineering', 'Animal Husbandry',
- 'Geophysics', 'Crop Science', 'Management Science and Engineering', 'Psychology', 'Forestry',
- 'Textile Science and Engineering', 'Veterinary Medicine', 'Instrument Science and Technology', 'Physical Education'
- ]
+ logger = get_logger()

  SUBSET_MAPPING = {
  'Electronic Science and Technology': ['Engineering'],
@@ -106,104 +88,78 @@ SUBSET_MAPPING = {
  }


- @Benchmark.register(
- name='super_gpqa',
- pretty_name='SuperGPQA',
- tags=['MCQ', 'Knowledge'],
- description=
- 'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.', # noqa: E501
- dataset_id='m-a-p/SuperGPQA',
- model_adapter=OutputType.GENERATION,
- output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
- subset_list=SUBSET_LIST,
- metric_list=['AverageAccuracy'],
- few_shot_num=0,
- train_split=None,
- eval_split='train', # only have train split
+ @register_benchmark(
+ BenchmarkMeta(
+ name='super_gpqa',
+ pretty_name='SuperGPQA',
+ tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+ description=
+ 'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.', # noqa: E501
+ dataset_id='m-a-p/SuperGPQA',
+ subset_list=list(SUBSET_MAPPING.keys()),
+ metric_list=['acc'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # only have train split
+ prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+ )
  )
- class SuperGPQAAdapter(DataAdapter):
+ class SuperGPQAAdapter(MultiChoiceAdapter):

  def __init__(self, **kwargs):
- few_shot_num = kwargs.get('few_shot_num', 0)
- if few_shot_num > 0 and few_shot_num != 5:
- logger.warning(
- f'Only support few_shot_num 0 or 5 for SuperGPQA, but got {few_shot_num}. Use 5-shot by default.')
- kwargs['few_shot_num'] = 5
+
  super().__init__(**kwargs)
+ if self.few_shot_num > 0 and self.few_shot_num != 5:
+ logger.warning(
+ f'Only support few_shot_num 0 or 5 for SuperGPQA, but got {self.few_shot_num}. Use 5-shot by default.'
+ )
+ self.few_shot_num = 5

- self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+ self.reformat_subset = True
  self.category_map = SUBSET_MAPPING
- self.few_shot_prompt = open(os.path.join(current_dir, 'five_shot_prompt.txt'), encoding='utf-8').read()
- self.zero_shot_prompt = open(os.path.join(current_dir, 'zero_shot_prompt.txt'), encoding='utf-8').read()
-
- def load(self, **kwargs):
- kwargs['subset_list'] = ['default']
- data_dict = super().load(**kwargs)
- return self.reformat_subset(data_dict, subset_key='field', format='{}')
-
- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
- question = input_d['question']
- choices = self._format_choices(input_d['options'])
- if not self.prompt_template:
- if few_shot_list:
- prompt = self.few_shot_prompt.format(query=question, choices=choices)
- else:
- prompt = self.zero_shot_prompt.format(query=question, choices=choices)
- else:
- prompt = self.prompt_template.format(query=question, choices=choices)
- return self.gen_prompt_data(prompt)
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Get the gold choice
- return input_d.get('answer_letter')

- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ return Sample(
+ input=record['question'],
+ choices=record['options'],
+ target=record['answer_letter'],
+ subset_key=record['field'],
+ metadata={
+ 'field': record['field'],
+ 'discipline': record['discipline'],
+ 'uuid': record.get('uuid', ''),
+ 'explanation': record.get('answer', ''),
+ },
+ )
+
+ def format_fewshot_template(self, fewshot, sample):
+ from .prompt import FEW_SHOT_SAMPLES
+
+ return FEW_SHOT_TEMPLATE.format(fewshot=FEW_SHOT_SAMPLES, ) + self.format_prompt_template(sample)
+
+ def extract_answer(self, prediction: str, task_state) -> str:
  """
- Parse the model output to get the answer. Could be the best choice index.
-
- Args:
- result: Predicted answer from the model. Usually a string for chat.
- raw_input_d: The raw input. Depending on the dataset.
- eval_type: 'checkpoint' or 'service' or 'custom'
-
- Returns:
- The parsed answer. Depending on the dataset. Usually a string for chat.
+ Extract the answer from the prediction.
  """
- if self.model_adapter == OutputType.MULTIPLE_CHOICE:
- return result
+ from .utils import extract_option_content, extract_option_labels
+
+ choices = [choice.value for choice in task_state.choices]
+ if self.few_shot_num == 0:
+ predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
+ if predict is None:
+ # Try to extract by content matching
+ predict = extract_option_content(prediction, choices)
+ predict = chr(choices.index(predict) + 65) if predict else None
  else:
- from evalscope.benchmarks.super_gpqa.utils import extract_option_content, extract_option_labels
- sample = raw_input_d
- if self.few_shot_num == 0:
- predict = extract_option_labels(result, 'ABCDEFGHIJ')
- if predict is None:
- predict = extract_option_content(result, sample['options'])
- predict = chr(sample['options'].index(predict) + 65) if predict else None
- else:
- response = result.split('Question:')[0]
- predict = extract_option_labels(response, 'ABCDEFGHIJ')
+ response = prediction.split('Question:')[0]
+ predict = extract_option_labels(response, 'ABCDEFGHIJ')
+ if predict is None:
+ predict = extract_option_content(response, choices)
+ predict = chr(choices.index(predict) + 65) if predict else None
+ if predict is None:
+ predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
  if predict is None:
- predict = extract_option_content(response, sample['options'])
- predict = chr(sample['options'].index(predict) + 65) if predict else None
- if predict is None:
- predict = extract_option_labels(result, 'ABCDEFGHIJ')
- if predict is None:
- predict = extract_option_content(result, sample['options'])
- predict = chr(sample['options'].index(predict) + 65) if predict else None
- return predict
-
- def match(self, gold: str, pred: str) -> float:
- return exact_match(gold=gold, pred=pred)
+ predict = extract_option_content(prediction, choices)
+ predict = chr(choices.index(predict) + 65) if predict else None

- def _format_choices(self, choices: list) -> str:
- """
- Format the choices into a string for display.
-
- Args:
- choices (list): List of choices.
-
- Returns:
- str: Formatted string of choices.
- """
- choice_list = [f'{option}) {content}' for option, content in zip(self.choices, choices)]
- return '\n'.join(choice_list)
+ return predict or ''
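
The adapter above is representative of the 0.17 → 1.0 migration applied across the benchmarks directory: the `@Benchmark.register(...)` decorator and the `gen_prompt` / `get_gold_answer` / `parse_pred_result` / `match` hooks give way to `@register_benchmark(BenchmarkMeta(...))` plus a `record_to_sample` mapping onto the new `Sample` API. A minimal sketch of the new shape, using only imports and fields visible in this diff; the `my_mcq` benchmark name, dataset id, and record field names are illustrative, not part of the package:

# Minimal sketch of the evalscope 1.0.0 adapter pattern; illustrative names only.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',                    # illustrative benchmark name
        dataset_id='org/my-mcq-dataset',  # illustrative dataset id
        tags=[Tags.MULTIPLE_CHOICE],
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the unified Sample structure.
        return Sample(
            input=record['question'],
            choices=record['options'],
            target=record['answer'],      # illustrative field name
        )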
evalscope/benchmarks/super_gpqa/utils.py
@@ -1,5 +1,6 @@
  # flake8: noqa
  import re
+ from collections.abc import Sequence


  def safe_regex_search(pattern, text, flags=0):
@@ -51,7 +52,7 @@ def extract_option_labels(text, options='ABCDEFGHIJ'):


  def extract_option_content(text, options_content=None):
- if not isinstance(text, str) or not isinstance(options_content, list):
+ if not isinstance(text, str) or not isinstance(options_content, Sequence):
  return 'error'

  escaped_options_content = [re.escape(option_content) for option_content in options_content]
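
The one-line change above loosens the type guard in `extract_option_content` from `list` to `collections.abc.Sequence`, so option containers other than plain lists are no longer rejected with `'error'`. A small illustration of what the relaxed check admits (the option strings are made up):

from collections.abc import Sequence

options = ('interference', 'diffraction', 'refraction')  # a tuple, not a list
assert isinstance(options, Sequence)   # accepted by the new guard
assert not isinstance(options, list)   # would have been rejected by the old check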
evalscope/benchmarks/tau_bench/generation.py
@@ -0,0 +1,147 @@
+ from typing import Any, Dict, List, Optional
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.model.model_output import ChatCompletionChoice
+ from evalscope.api.tool import ToolInfo
+ from evalscope.models.utils.openai import openai_chat_choices
+ from evalscope.utils.function_utils import run_once
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @run_once
+ def _patch_agent_solve(model: Model):
+ """Patch ToolCallingAgent.solve method to use custom model configuration"""
+ from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
+ from tau_bench.envs.base import Env
+ from tau_bench.types import RESPOND_ACTION_NAME, Action, SolveResult
+
+ def patched_solve(
+ self,
+ env: Env,
+ task_index: Optional[int] = None,
+ max_num_steps: int = 30,
+ ) -> SolveResult:
+ env_reset_res = env.reset(task_index=task_index)
+ obs = env_reset_res.observation
+ info = env_reset_res.info.model_dump()
+ reward = 0.0
+ messages: List[Dict[str, Any]] = [
+ {
+ 'role': 'system',
+ 'content': self.wiki
+ },
+ {
+ 'role': 'user',
+ 'content': obs
+ },
+ ]
+
+ for step_index in range(max_num_steps):
+ res = model.generate(
+ input=[dict_to_chat_message(msg) for msg in messages],
+ tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
+ )
+ oai_res = openai_chat_choices(res.choices)
+
+ next_message = oai_res[0].message.model_dump(exclude_none=True)
+
+ action = message_to_action(next_message)
+
+ env_response = env.step(action)
+ reward = env_response.reward
+ info = {**info, **env_response.info.model_dump()}
+
+ if action.name != RESPOND_ACTION_NAME:
+ next_message['tool_calls'] = next_message['tool_calls'][:1]
+ messages.extend([
+ next_message,
+ {
+ 'role': 'tool',
+ 'tool_call_id': next_message['tool_calls'][0]['id'],
+ 'name': next_message['tool_calls'][0]['function']['name'],
+ 'content': env_response.observation,
+ },
+ ])
+ else:
+ messages.extend([
+ next_message,
+ {
+ 'role': 'user',
+ 'content': env_response.observation
+ },
+ ])
+ logger.debug(f'Task: {task_index} Step: {step_index} finished')
+
+ if env_response.done:
+ break
+
+ return SolveResult(
+ reward=reward,
+ info=info,
+ messages=messages,
+ total_cost=0,
+ )
+
+ ToolCallingAgent.solve = patched_solve
+
+ return 'ToolCallingAgent.solve patched successfully'
+
+
+ def predict(model: Model, sample: Sample) -> ModelOutput:
+ """
+ Generate predictions for tau_bench tasks using the model.
+
+ Args:
+ model: The model to use for prediction
+ sample: The sample containing task metadata
+
+ Returns:
+ ModelOutput containing the prediction results
+ """
+ from tau_bench.agents.tool_calling_agent import ToolCallingAgent
+ from tau_bench.envs import get_env
+
+ _patch_agent_solve(model)
+ try:
+ # Extract task information from sample metadata
+ task_data = sample.metadata
+ env_name = task_data['env_name']
+ task_index = task_data['task_index']
+
+ # Direct call to tau_bench_server adapter's solve method
+ # This method can be implemented to solve specific tasks in the TauBench environment
+ isolated_env = get_env(
+ env_name=env_name,
+ user_strategy='llm',
+ user_model='dummy', # Use dummy model to prevent errors
+ user_provider='openai', # Use dummy provider to prevent errors
+ task_split='test',
+ task_index=task_index,
+ )
+ agent = ToolCallingAgent(
+ tools_info=isolated_env.tools_info,
+ wiki=isolated_env.wiki,
+ model='dummy', # Use dummy model to prevent errors
+ provider='dummy', # Use dummy provider to prevent errors
+ temperature=0, # dummy temperature to prevent errors
+ )
+
+ res = agent.solve(env=isolated_env, task_index=task_index)
+
+ sample.metadata['task_result'] = res.model_dump(exclude_none=True)
+ return ModelOutput(
+ model=model.name,
+ choices=[ChatCompletionChoice.from_content(res.model_dump_json(indent=2))],
+ )
+
+ except Exception as e:
+ logger.error(f'Error in tau_bench prediction: {str(e)}')
+ sample.metadata['task_result'] = {'reward': 0, 'error': str(e)}
+ return ModelOutput(
+ model=model.name,
+ choices=[ChatCompletionChoice.from_content('')],
+ )
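
The new `generation.py` carries the inference path that the τ-bench adapter delegates to (see `_on_inference` in the adapter diff below): `_patch_agent_solve` reroutes `ToolCallingAgent.solve` through the evalscope `Model` under evaluation, and `predict` runs one isolated τ-bench environment per sample, stashing the raw `SolveResult` in `sample.metadata['task_result']`. A rough sketch of calling `predict` directly, assuming tau-bench is installed; the model id, endpoint, key, and task metadata below are placeholders:

from evalscope.api.dataset import Sample
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.model import GenerateConfig, get_model
from evalscope.constants import EvalType

# Placeholder OpenAI-compatible service model, mirroring the get_model call in the adapter below.
model = get_model(
    model='qwen-plus',
    eval_type=EvalType.SERVICE,
    base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='EMPTY',
    config=GenerateConfig(temperature=0.7),
)

# Sample shaped like the ones TauBenchAdapter.record_to_sample produces.
sample = Sample(
    input=[ChatMessageUser(content='')],
    target='',
    metadata={'env_name': 'airline', 'task_index': 0},
)

output = predict(model, sample)        # runs the patched agent loop against the isolated env
print(sample.metadata['task_result'])  # SolveResult dump, including the reward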
evalscope/benchmarks/tau_bench/tau_bench_adapter.py
@@ -2,35 +2,46 @@ import importlib
  from collections import defaultdict
  from typing import Dict, List

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.dataset.dataset import DatasetDict
+ from evalscope.api.dataset.loader import DictDataLoader
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import run_once

  logger = get_logger()


- @Benchmark.register(
- name='tau_bench',
- pretty_name='τ-bench',
- tags=['Reasoning', 'Agent', 'Function Calling'],
- description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
- 'and a language agent provided with domain-specific API tools and policy guidelines. '
- 'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ', # noqa: E501
- dataset_id='https://github.com/sierra-research/tau-bench',
- model_adapter='tau_bench_server',
- subset_list=['airline', 'retail'],
- metric_list=['Pass^1'],
- eval_split='test',
- extra_params={
- 'user_model': 'qwen-plus',
- 'api_key': 'EMPTY',
- 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
- 'generation_config': {
- 'temperature': 0.7,
- 'max_new_tokens': 1024
+ @register_benchmark(
+ BenchmarkMeta(
+ name='tau_bench',
+ pretty_name='τ-bench',
+ tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+ description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+ 'and a language agent provided with domain-specific API tools and policy guidelines. '
+ 'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
+ 'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/tau_bench.html)', # noqa: E501
+ dataset_id='https://github.com/sierra-research/tau-bench',
+ subset_list=['airline', 'retail'],
+ metric_list=['Pass^1'],
+ eval_split='test',
+ extra_params={
+ 'user_model': 'qwen-plus',
+ 'api_key': 'EMPTY',
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'generation_config': {
+ 'temperature': 0.7,
+ 'max_new_tokens': 1024
+ }
  }
- })
- class TauBenchAdapter(DataAdapter):
+ )
+ )
+ class TauBenchAdapter(DefaultDataAdapter):

  def __init__(self, **kwargs):
  super().__init__(**kwargs)
@@ -41,32 +52,39 @@ class TauBenchAdapter(DataAdapter):
  '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
  )

- metric_registry.register(Metric(name='Pass^1', object=mean))
-
  # setup user model args
- extra_params = kwargs.get('extra_params', {})
- self.user_model = extra_params.get('user_model', 'qwen-plus')
- self.api_key = extra_params.get('api_key', 'EMPTY')
- self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
- self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+ self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+ self.api_key = self.extra_params.get('api_key', 'EMPTY')
+ self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+ self.generation_config = self.extra_params.get(
+ 'generation_config', {
+ 'temperature': 0.7,
+ 'max_new_tokens': 1024
+ }
+ )

  self._patch_env_completion()

+ @run_once
  def _patch_env_completion(self) -> str:
  from tau_bench.envs.user import LLMUserSimulationEnv

  def new_generate_next_message(self, messages):
- from evalscope.models import ServerModelAdapter
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import GenerateConfig, get_model
+ from evalscope.constants import EvalType
+
+ user_server = get_model(
+ model=adapter_instance.user_model,
+ eval_type=EvalType.SERVICE,
+ base_url=adapter_instance.api_base,
+ api_key=adapter_instance.api_key,
+ config=GenerateConfig(**adapter_instance.generation_config)
+ )

- user_server = ServerModelAdapter(
- api_url=adapter_instance.api_base,
- model_id=adapter_instance.user_model,
- api_key=adapter_instance.api_key)
- request_json = user_server.make_request(
- input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
- res = user_server.send_request(request_json)
+ res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

- message = res['choices'][0]['message']
+ message = res.message.model_dump(exclude_none=True)
  self.messages.append(message)
  self.total_cost = 0
  return message['content']
@@ -75,7 +93,7 @@ class TauBenchAdapter(DataAdapter):
  adapter_instance = self
  LLMUserSimulationEnv.generate_next_message = new_generate_next_message

- def load(self, **kwargs):
+ def load(self):
  from tau_bench.envs import get_env

  data_dict = defaultdict(dict)
@@ -94,17 +112,57 @@ class TauBenchAdapter(DataAdapter):
  'task_index': i,
  'env_name': env_name,
  })
- data_dict[env_name][self.eval_split] = tasks
-
- return data_dict
-
- def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
- return self.gen_prompt_data(extra_data=input_d)
-
- def get_gold_answer(self, input_d):
- return ''
-
- def match(self, gold, pred):
- import json
- res = json.loads(pred)
- return res.get('reward', 0.0)
+ # load dataset
+ dataset = DictDataLoader(
+ dict_list=tasks, sample_fields=self.record_to_sample, limit=self.limit, repeats=self.repeats
+ ).load()
+
+ data_dict[env_name] = dataset
+
+ test_dataset = DatasetDict(data_dict)
+
+ return test_dataset, None
+
+ def record_to_sample(self, record: Dict) -> Sample:
+ """Convert a data record to a Sample object."""
+ return Sample(
+ input=[ChatMessageUser(content='')],
+ target='', # Will use the record for evaluation
+ subset_key=record['env_name'],
+ metadata=record # Store the full record for evaluation
+ )
+
+ def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+ from .generation import predict
+ return predict(model, sample)
+
+ def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ try:
+ # Parse the prediction to get the reward
+ res = task_state.metadata
+ reward = res.get('reward', 0.0)
+
+ score.value = {
+ 'Pass^1': float(reward),
+ }
+ score.explanation = f'Task completed with reward: {reward}'
+ score.metadata = {
+ 'task_result': res,
+ 'env_name': task_state.metadata.get('env_name', 'unknown'),
+ 'task_index': task_state.metadata.get('task_index', -1)
+ }
+ score.main_score_name = 'Pass^1'
+
+ except Exception as e:
+ score.value = {'Pass^1': 0.0}
+ score.explanation = f'Evaluation failed: {str(e)}'
+ score.metadata = {'error': str(e)}
+ score.main_score_name = 'Pass^1'
+
+ return score
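
End to end, the reworked adapter wires the pieces together: `extra_params` configures the simulated-user model that `_patch_env_completion` routes through `get_model`, `_on_inference` calls `predict` from `generation.py`, and `match_score` reads the reward back out of `task_state.metadata` as `Pass^1`. A hedged run sketch, assuming evalscope's `TaskConfig`/`run_task` entry points and an installed tau-bench; endpoint, keys, and model ids are placeholders:

# Assumed entry points (TaskConfig / run_task) and placeholder credentials; adjust to your setup.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen-plus',                  # model under evaluation (placeholder)
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'subset_list': ['airline'],
            'extra_params': {           # forwarded to TauBenchAdapter via self.extra_params
                'user_model': 'qwen-plus',
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
            },
        }
    },
)
run_task(task)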