evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py

@@ -2,80 +2,90 @@ import importlib
  from collections import defaultdict
  from typing import Dict, List

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.dataset.dataset import DatasetDict
+ from evalscope.api.dataset.loader import DictDataLoader
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import run_once
+ from evalscope.utils.import_utils import check_import

  logger = get_logger()


- @Benchmark.register(
-     name='tau_bench',
-     pretty_name='τ-bench',
-     tags=['Reasoning', 'Agent', 'Function Calling'],
-     description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
-     'and a language agent provided with domain-specific API tools and policy guidelines. '
-     'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ',  # noqa: E501
-     dataset_id='https://github.com/sierra-research/tau-bench',
-     model_adapter='tau_bench_server',
-     subset_list=['airline', 'retail'],
-     metric_list=['Pass^1'],
-     eval_split='test',
-     extra_params={
-         'user_model': 'qwen-plus',
-         'api_key': 'EMPTY',
-         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-         'generation_config': {
-             'temperature': 0.7,
-             'max_new_tokens': 1024
+ @register_benchmark(
+     BenchmarkMeta(
+         name='tau_bench',
+         pretty_name='τ-bench',
+         tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+         description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+         'and a language agent provided with domain-specific API tools and policy guidelines. '
+         'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
+         'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/tau_bench.html)',  # noqa: E501
+         dataset_id='https://github.com/sierra-research/tau-bench',
+         subset_list=['airline', 'retail'],
+         metric_list=['Pass^1'],
+         eval_split='test',
+         extra_params={
+             'user_model': 'qwen-plus',
+             'api_key': 'EMPTY',
+             'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+             'generation_config': {
+                 'temperature': 0.0,
+                 'max_tokens': 4096,
+             }
          }
-     })
- class TauBenchAdapter(DataAdapter):
+     )
+ )
+ class TauBenchAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         spec = importlib.util.find_spec('tau_bench')
-         if spec is None:
-             raise ImportError(
-                 '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
-             )
-
-         metric_registry.register(Metric(name='Pass^1', object=mean))
+         check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)

          # setup user model args
-         extra_params = kwargs.get('extra_params', {})
-         self.user_model = extra_params.get('user_model', 'qwen-plus')
-         self.api_key = extra_params.get('api_key', 'EMPTY')
-         self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-         self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+         self.api_key = self.extra_params.get('api_key', 'EMPTY')
+         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+         self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})

          self._patch_env_completion()

+     @run_once
      def _patch_env_completion(self) -> str:
          from tau_bench.envs.user import LLMUserSimulationEnv

          def new_generate_next_message(self, messages):
-             from evalscope.models import ServerModelAdapter
+             from evalscope.api.messages import dict_to_chat_message
+             from evalscope.api.model import GenerateConfig, get_model
+             from evalscope.constants import EvalType
+
+             user_server = get_model(
+                 model=adapter_instance.user_model,
+                 eval_type=EvalType.SERVICE,
+                 base_url=adapter_instance.api_base,
+                 api_key=adapter_instance.api_key,
+                 config=GenerateConfig(**adapter_instance.generation_config)
+             )

-             user_server = ServerModelAdapter(
-                 api_url=adapter_instance.api_base,
-                 model_id=adapter_instance.user_model,
-                 api_key=adapter_instance.api_key)
-             request_json = user_server.make_request(
-                 input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
-             res = user_server.send_request(request_json)
+             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

-             message = res['choices'][0]['message']
+             message = {'role': 'assistant', 'content': res.completion}
              self.messages.append(message)
              self.total_cost = 0
-             return message['content']
+             return res.completion

          # get the current instance of TauBenchAdapter
          adapter_instance = self
          LLMUserSimulationEnv.generate_next_message = new_generate_next_message

-     def load(self, **kwargs):
+     def load(self):
          from tau_bench.envs import get_env

          data_dict = defaultdict(dict)
@@ -94,17 +104,61 @@ class TauBenchAdapter(DataAdapter):
                      'task_index': i,
                      'env_name': env_name,
                  })
-             data_dict[env_name][self.eval_split] = tasks
-
-         return data_dict
-
-     def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
-         return self.gen_prompt_data(extra_data=input_d)
-
-     def get_gold_answer(self, input_d):
-         return ''
-
-     def match(self, gold, pred):
-         import json
-         res = json.loads(pred)
-         return res.get('reward', 0.0)
+             # load dataset
+             dataset = DictDataLoader(
+                 dict_list=tasks,
+                 sample_fields=self.record_to_sample,
+                 limit=self.limit,
+                 repeats=self.repeats,
+                 shuffle=self.shuffle,
+             ).load()
+
+             data_dict[env_name] = dataset
+
+         test_dataset = DatasetDict(data_dict)
+
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict) -> Sample:
+         """Convert a data record to a Sample object."""
+         return Sample(
+             input=[ChatMessageUser(content='')],
+             target='',  # Will use the record for evaluation
+             subset_key=record['env_name'],
+             metadata=record  # Store the full record for evaluation
+         )
+
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         from .generation import predict
+         return predict(model, sample)
+
+     def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         try:
+             # Parse the prediction to get the reward
+             task_result = task_state.metadata['task_result']
+             reward = task_result.get('reward', 0.0)
+
+             score.value = {
+                 'Pass^1': float(reward),
+             }
+             score.explanation = f'Task completed with reward: {reward}'
+             score.metadata = {
+                 'task_result': task_result,
+                 'env_name': task_state.metadata.get('env_name', 'unknown'),
+                 'task_index': task_state.metadata.get('task_index', -1)
+             }
+             score.main_score_name = 'Pass^1'
+
+         except Exception as e:
+             score.value = {'Pass^1': 0.0}
+             score.explanation = f'Evaluation failed: {str(e)}'
+             score.metadata = {'error': str(e)}
+             score.main_score_name = 'Pass^1'
+
+         return score
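
Taken together, the tau_bench changes above illustrate the 0.x to 1.x adapter migration: `@Benchmark.register(...)` on a `DataAdapter` becomes `@register_benchmark(BenchmarkMeta(...))` on a `DefaultDataAdapter`, the old `gen_prompt`/`get_gold_answer`/`match` hooks are replaced by `record_to_sample` and `match_score`, and per-sample results are returned as `Score` objects. Below is a minimal sketch of that pattern using only names that appear in this diff; the benchmark name, dataset id, record fields and metric are hypothetical, and any signature detail beyond what the diff shows is an assumption rather than the library's documented API.

# Hedged sketch only: names come from the imports in this diff; the benchmark
# name, dataset id, record fields and metric below are hypothetical.
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_bench',                 # hypothetical benchmark name
        pretty_name='My-Bench',
        tags=[Tags.REASONING],
        description='Toy example of the 1.x adapter pattern.',
        dataset_id='org/my-dataset',     # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
    )
)
class MyBenchAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: dict) -> Sample:
        # Map one raw record to the framework's Sample type.
        return Sample(
            input=[ChatMessageUser(content=record['question'])],
            target=record['answer'],
            metadata=record,
        )

    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
        # Per-sample scoring: the value dict holds one entry per metric name.
        score = Score(extracted_prediction=filtered_prediction, prediction=original_prediction)
        score.value = {'acc': float(filtered_prediction.strip() == reference.strip())}
        score.main_score_name = 'acc'
        return score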
evalscope/benchmarks/text2image/__init__.py (file without changes)
evalscope/benchmarks/text2image/evalmuse_adapter.py

@@ -0,0 +1,78 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+ from evalscope.api.metric.scorer import AggScore, Score
+ from evalscope.api.registry import get_metric, register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.metrics import mean
+ from evalscope.utils.function_utils import thread_safe
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='evalmuse',
+         pretty_name='EvalMuse',
+         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+         description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+         'and semantic alignment of finely generated images',
+         tags=[Tags.TEXT_TO_IMAGE],
+         subset_list=['EvalMuse'],
+         metric_list=['FGA_BLIP2Score'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+     )
+ )
+ class EvalMuseAdapter(Text2ImageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         assert len(self.metric_list
+                    ) == 1 and self.metric_list[0] == 'FGA_BLIP2Score', 'Only FGA_BLIP2Score is supported for EvalMuse'
+
+     @thread_safe
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state):
+         # Get prediction and prompt from task state
+         image_path = task_state.metadata.get('image_path', original_prediction)
+
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=image_path,
+             prediction=image_path,
+         )
+
+         # Calculate scores for each configured metric
+         try:
+             metric_name = self.metric_list[0]
+             metric_cls = get_metric(metric_name)
+             metric_func = metric_cls()  # Initialize with parameters
+             metric_score = metric_func(image_path, task_state.metadata)[0]
+
+             for k, v in metric_score.items():
+                 score.value[f'{metric_name}:{k}'] = v.cpu().item()
+         except Exception as e:
+             logger.error(f'Error calculating metric {metric_name}: {e}')
+             score.value[metric_name] = 0
+             score.metadata[metric_name] = f'error: {str(e)}'
+
+         return score
+
+     def aggregate_scores(self, sample_scores) -> List[AggScore]:
+         new_items = defaultdict(list)
+         agg_list = []
+         for sample_score in sample_scores:
+             for metric_name, value in sample_score.score.value.items():
+                 metrics_prefix = metric_name.split(':')[0]
+                 category = metric_name.rpartition('(')[-1].split(')')[0]
+                 category = category.split('-')[0].lower()  # remove the suffix if exists
+                 new_items[f'{metrics_prefix}:{category}'].append(value)
+
+         for k, v in new_items.items():
+             agg_list.append(AggScore(metric_name=k, score=mean(v), num=len(v)))
+
+         return agg_list
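
The key normalization inside `aggregate_scores` above is compact: it groups per-sample values under a 'metric:category' key, taking the category from a parenthesized suffix of each element key and dropping any '-' suffix. A small standalone illustration of that step follows; the example keys are made up, only mirroring the shape the FGA_BLIP2Score element scores appear to use.

from collections import defaultdict


def normalize_key(metric_name: str) -> str:
    # Mirrors the grouping logic in EvalMuseAdapter.aggregate_scores:
    # keep the metric prefix, reduce the element key to the category found
    # inside parentheses, and drop any '-' suffix.
    metrics_prefix = metric_name.split(':')[0]
    category = metric_name.rpartition('(')[-1].split(')')[0]
    category = category.split('-')[0].lower()
    return f'{metrics_prefix}:{category}'


# Hypothetical per-element keys:
print(normalize_key('FGA_BLIP2Score:flower (attribute-color)'))  # FGA_BLIP2Score:attribute
print(normalize_key('FGA_BLIP2Score:overall (global)'))          # FGA_BLIP2Score:global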
evalscope/benchmarks/text2image/genai_bench_adapter.py

@@ -0,0 +1,53 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+
+ from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser
+ from evalscope.api.registry import get_metric, register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='genai_bench',
+         pretty_name='GenAI-Bench',
+         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+         description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
+         tags=[Tags.TEXT_TO_IMAGE],
+         subset_list=['GenAI-Bench-1600'],
+         metric_list=['VQAScore'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+     )
+ )
+ class GenAIBenchAdapter(Text2ImageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load_from_disk(self, **kwargs):
+         if os.path.isfile(self.dataset_id):
+             file_name = os.path.basename(self.dataset_id)
+             file_without_ext = os.path.splitext(file_name)[0]
+             self.subset_list = [file_without_ext]
+
+         return super().load_from_disk(use_local_loader=True)
+
+     def record_to_sample(self, record) -> Sample:
+         """Convert a record dictionary to a Sample object."""
+         advanced = record['tags'].get('advanced')
+         return Sample(
+             input=[ChatMessageUser(content=record['prompt'])],
+             metadata={
+                 'id': record['id'],
+                 'prompt': record['prompt'],
+                 'category': 'advanced' if advanced else 'basic',
+                 'tags': record.get('tags', []),
+                 'image_path': record.get('image_path', ''),  # Optional field for existing image path
+             }
+         )
evalscope/benchmarks/text2image/general_t2i_adapter.py

@@ -0,0 +1,42 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+
+ from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='general_t2i',
+         dataset_id='general_t2i',
+         description='General Text-to-Image Benchmark',
+         tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
+         subset_list=['default'],
+         metric_list=['PickScore'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+     )
+ )
+ class GeneralT2IAdapter(Text2ImageAdapter):
+
+     def __init__(self, **kwargs):
+
+         super().__init__(**kwargs)
+
+     def load_from_disk(self, **kwargs):
+         if os.path.isfile(self.dataset_id):
+             file_name = os.path.basename(self.dataset_id)
+             file_without_ext = os.path.splitext(file_name)[0]
+             self.subset_list = [file_without_ext]
+
+         return super().load_from_disk(use_local_loader=True)
+
+     def record_to_sample(self, record):
+         return Sample(input=[ChatMessageUser(content=record['prompt'])], metadata={'image_path': record['image_path']})
evalscope/benchmarks/text2image/hpdv2_adapter.py

@@ -0,0 +1,52 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+
+ from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='hpdv2',
+         pretty_name='HPD-v2',
+         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+         description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+         'trained on the Human Preference Dataset (HPD v2)',
+         tags=[Tags.TEXT_TO_IMAGE],
+         subset_list=['HPDv2'],
+         metric_list=['HPSv2.1Score'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+     )
+ )
+ class HPDv2Adapter(Text2ImageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load_from_disk(self, **kwargs):
+         if os.path.isfile(self.dataset_id):
+             file_name = os.path.basename(self.dataset_id)
+             file_without_ext = os.path.splitext(file_name)[0]
+             self.subset_list = [file_without_ext]
+
+         return super().load_from_disk(use_local_loader=True)
+
+     def record_to_sample(self, record):
+         return Sample(
+             input=[ChatMessageUser(content=record['prompt'])],
+             metadata={
+                 'id': record['id'],
+                 'prompt': record['prompt'],
+                 'category': record.get('tags', {}).get('category', ''),
+                 'tags': record.get('tags', {}),
+                 'image_path': record.get('image_path', ''),  # Optional field for existing image path
+             }
+         )
evalscope/benchmarks/text2image/tifa_adapter.py

@@ -0,0 +1,27 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='tifa160',
+         pretty_name='TIFA-160',
+         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+         description='TIFA-160 Text-to-Image Benchmark',
+         tags=[Tags.TEXT_TO_IMAGE],
+         subset_list=['TIFA-160'],
+         metric_list=['PickScore'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+     )
+ )
+ class TIFA_Adapter(Text2ImageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
evalscope/benchmarks/tool_bench/tool_bench_adapter.py

@@ -1,81 +1,102 @@
  import json
- from typing import Dict, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import Metric, mean, metric_registry
-
-
- @Benchmark.register(
-     name='tool_bench',
-     pretty_name='ToolBench-Static',
-     tags=['Reasoning', 'Agent', 'Function Calling'],
-     description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
-     'It includes various subsets such as in-domain and out-of-domain, '
-     'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
-     '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',  # noqa: E501
-     dataset_id='AI-ModelScope/ToolBench-Static',
-     subset_list=['in_domain', 'out_of_domain'],
-     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='tool_bench',
+         pretty_name='ToolBench-Static',
+         tags=[Tags.REASONING, Tags.FUNCTION_CALLING],
+         description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+         'It includes various subsets such as in-domain and out-of-domain, '
+         'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+         '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',
+         dataset_id='AI-ModelScope/ToolBench-Static',
+         subset_list=['in_domain', 'out_of_domain'],
+         metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+         eval_split='test',
+     )
  )
- class ToolBenchAdapter(DataAdapter):
+ class ToolBenchAdapter(DefaultDataAdapter):
+     """
+     ToolBench adapter using the new data processing framework.
+     """

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         metric_registry.register(Metric(name='Rouge-L', object=mean))
-         metric_registry.register(Metric(name='Act.EM', object=mean))
-         metric_registry.register(Metric(name='Plan.EM', object=mean))
-         metric_registry.register(Metric(name='F1', object=mean))
-         metric_registry.register(Metric(name='HalluRate', object=mean))
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
-         """
-         messages = input_d['messages']
-         # use prepared messages and remove the name field
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         messages = record['messages']
+
+         # Process messages and remove the name field, convert function messages
+         processed_messages = []
          for message in messages:
-             if 'name' in message:
-                 del message['name']
-             if 'role' in message:
-                 if message['role'] == 'function':
-                     content = json.dumps(message, ensure_ascii=False)
-                     message['role'] = 'user'
-                     message['content'] = content
-         return self.gen_prompt_data(prompt='', messages=messages)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-         """
-         return result
-
-     def match(self, gold: dict, pred: str) -> Dict:
-         """
-         Match the gold answer and the predicted answer.
-         """
+             msg_dict = message.copy()
+             if 'name' in msg_dict:
+                 del msg_dict['name']
+             if 'role' in msg_dict:
+                 if msg_dict['role'] == 'function':
+                     content = json.dumps(msg_dict, ensure_ascii=False)
+                     msg_dict['role'] = 'user'
+                     msg_dict['content'] = content
+
+             # Convert to ChatMessage object
+             chat_msg = dict_to_chat_message(msg_dict)
+             processed_messages.append(chat_msg)
+
+         return Sample(
+             input=processed_messages,
+             target='',  # Store the full record as target for evaluation
+             metadata={
+                 'target': record['target'],
+                 'tools': record['tools'],
+                 'messages': record['messages']
+             }
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
          from .utils import calculate_metrics

-         data = {
-             'target': gold['target'],
-             'predictions': pred,
-             'tools': gold['tools'],
-         }
-         metrics = calculate_metrics(data)
-         return metrics
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         doc = task_state.metadata
+
+         try:
+             data = {
+                 'target': doc['target'],
+                 'predictions': filtered_prediction,
+                 'tools': doc['tools'],
+             }
+             metrics = calculate_metrics(data)
+
+             score.value = metrics
+             score.explanation = f'Metrics: {metrics}'
+             score.metadata = {'target': doc['target'], 'tools': doc['tools'], 'detailed_metrics': metrics}
+             # Set the main score (you can choose the most important metric)
+             score.main_score_name = 'F1'

-     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
-         # aggregate review results
-         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+         except Exception as e:
+             # Handle evaluation errors
+             score.value = {'Act.EM': 0.0, 'Plan.EM': 0.0, 'F1': 0.0, 'HalluRate': 1.0, 'Rouge-L': 0.0}
+             score.explanation = f'Evaluation failed: {str(e)}'
+             score.metadata = {'error': str(e)}
+             score.main_score_name = 'F1'

-         return super().compute_metric(res_dict, **kwargs)
+         return score
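
For orientation, the names registered by these adapters (`tau_bench`, `tool_bench`, `evalmuse`, `genai_bench`, `general_t2i`, `hpdv2`, `tifa160`) are what a user passes as dataset names when launching an evaluation. A hedged usage sketch follows, assuming the public `TaskConfig`/`run_task` entry point carried over from 0.x still works this way in 1.0.1; the model, endpoint, and key are placeholders.

from evalscope import TaskConfig, run_task

# Placeholder model and OpenAI-compatible endpoint; substitute your own.
task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
    api_key='YOUR_API_KEY',
    eval_type='service',
    datasets=['tool_bench'],   # any benchmark name registered via @register_benchmark
    limit=5,                   # small smoke-test run
)

run_task(task_cfg=task_cfg)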