evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/api/registry.py ADDED
@@ -0,0 +1,182 @@
+ import copy
+ from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+
+ if TYPE_CHECKING:
+     from evalscope.api.benchmark import BenchmarkMeta, DataAdapter
+     from evalscope.api.filter import Filter
+     from evalscope.api.metric import Aggregator, Metric
+     from evalscope.api.model.model import ModelAPI
+     from evalscope.config import TaskConfig
+
+ # BEGIN: Registry for benchmarks
+ # Registry for benchmarks, allowing dynamic registration and retrieval of benchmark metadata and data adapters.
+ BENCHMARK_REGISTRY: Dict[str, 'BenchmarkMeta'] = {}
+
+
+ def register_benchmark(metadata: 'BenchmarkMeta'):
+     """Register a benchmark with its metadata."""
+
+     def register_wrapper(data_adapter: Type['DataAdapter']):
+         if metadata.name in BENCHMARK_REGISTRY:
+             raise ValueError(f'Benchmark {metadata.name} already registered')
+         metadata.data_adapter = data_adapter
+         BENCHMARK_REGISTRY[metadata.name] = metadata
+         return data_adapter
+
+     return register_wrapper
+
+
+ def get_benchmark(name: str, config: Optional['TaskConfig'] = None) -> 'DataAdapter':
+     """
+     Retrieve a registered benchmark by name.
+
+     Args:
+         name (str): The name of the benchmark.
+         config (Optional['TaskConfig']): The task configuration; `config.dataset_args[name]`
+             supplies dataset-specific overrides.
+
+     Returns:
+         DataAdapter: The benchmark's data adapter, initialized with its metadata.
+     """
+     # copy to avoid modifying the original metadata
+     metadata = copy.deepcopy(BENCHMARK_REGISTRY.get(name))
+     if not metadata:
+         raise ValueError(f'Benchmark {name} not found, available benchmarks: {list(sorted(BENCHMARK_REGISTRY.keys()))}')
+
+     # Update metadata with dataset-specific configuration
+     if config is not None:
+         metadata._update(config.dataset_args.get(name, {}))
+     # Return the data adapter initialized with the benchmark metadata
+     data_adapter_cls = metadata.data_adapter
+     return data_adapter_cls(benchmark_meta=metadata, task_config=config)
+
+
+ # END: Registry for benchmarks
+
+ # BEGIN: Registry for model APIs
+ # Registry for model APIs, allowing dynamic registration and retrieval of model API classes.
+ MODEL_APIS: Dict[str, Type['ModelAPI']] = {}
+
+
+ def register_model_api(name: str):
+     """
+     Decorator to register a model API class with a given name.
+
+     :param name: The name of the model API.
+     """
+
+     def decorator(api_class: Type['ModelAPI']):
+         if name in MODEL_APIS:
+             raise ValueError(f"Model API '{name}' is already registered.")
+         MODEL_APIS[name] = api_class
+         return api_class
+
+     return decorator
+
+
+ def get_model_api(name: str) -> Type['ModelAPI']:
+     """
+     Retrieve a registered model API class by name.
+
+     :param name: The name of the model API.
+     :return: The model API class.
+     """
+     if name not in MODEL_APIS:
+         raise ValueError(f"Model API '{name}' is not registered. Available model APIs: {list(MODEL_APIS.keys())}")
+
+     wrapped = MODEL_APIS[name]
+     if not isinstance(wrapped, type):
+         # a factory function was registered; call it to obtain the actual class
+         return wrapped()
+     else:
+         return wrapped
+
+
+ # END: Registry for model APIs
+
+ # BEGIN: Registry for metrics
+ METRIC_REGISTRY: Dict[str, Type['Metric']] = {}
+
+
+ def register_metric(name: str):
+
+     def decorate(fn):
+         if name in METRIC_REGISTRY:
+             raise ValueError(f"Metric named '{name}' conflicts with existing registered metric!")
+
+         METRIC_REGISTRY[name] = fn
+         return fn
+
+     return decorate
+
+
+ def get_metric(name: str) -> Type['Metric']:
+     if name in METRIC_REGISTRY:
+         return METRIC_REGISTRY[name]
+     else:
+         raise ValueError(
+             f"Metric '{name}' not found in the registry. Available metrics: {list(METRIC_REGISTRY.keys())}"
+         )
+
+
+ # END: Registry for metrics
+
+ # BEGIN: Registry for filters
+
+ FILTER_REGISTRY: Dict[str, Type['Filter']] = {}
+
+
+ def register_filter(name):
+
+     def decorate(cls):
+         if name in FILTER_REGISTRY:
+             raise ValueError(f'Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}')
+         FILTER_REGISTRY[name] = cls
+         return cls
+
+     return decorate
+
+
+ def get_filter(filter_name: str) -> Type['Filter']:
+     if filter_name not in FILTER_REGISTRY:
+         raise KeyError(
+             f"Filter '{filter_name}' not found in the registry. Available filters: {list(FILTER_REGISTRY.keys())}"
+         )
+     return FILTER_REGISTRY[filter_name]
+
+
+ # END: Registry for filters
+
+ # BEGIN: Registry for aggregation functions
+ AGGREGATION_REGISTRY: Dict[str, Type['Aggregator']] = {}
+
+
+ def register_aggregation(name: str):
+     """
+     Decorator to register an aggregation function with a given name.
+
+     :param name: The name of the aggregation function.
+     """
+
+     def decorator(aggregation_fn: 'Aggregator'):
+         if name in AGGREGATION_REGISTRY:
+             raise ValueError(f"Aggregation function '{name}' is already registered.")
+         AGGREGATION_REGISTRY[name] = aggregation_fn
+         return aggregation_fn
+
+     return decorator
+
+
+ def get_aggregation(name: str) -> Type['Aggregator']:
+     """
+     Retrieve a registered aggregation function by name.
+
+     :param name: The name of the aggregation function.
+     :return: The aggregation function.
+     """
+     if name not in AGGREGATION_REGISTRY:
+         raise ValueError(
+             f"Aggregation function '{name}' is not registered. "
+             f'Available aggregations: {list(AGGREGATION_REGISTRY.keys())}'
+         )
+     return AGGREGATION_REGISTRY[name]
+
+
+ # END: Registry for aggregation functions
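
For orientation, a minimal usage sketch of the registry API above (not part of the diff; the metric name `exact_match_demo` and its body are hypothetical). Only `register_metric` and `get_metric` come from `evalscope/api/registry.py`: the decorator stores whatever callable it receives under the given name, and both sides raise on duplicate or unknown names.

```python
# Hypothetical sketch; 'exact_match_demo' is illustrative, not shipped with evalscope.
from evalscope.api.registry import get_metric, register_metric

@register_metric('exact_match_demo')
def exact_match_demo(prediction: str, reference: str) -> float:
    """Score 1.0 when the stripped prediction matches the reference exactly."""
    return float(prediction.strip() == reference.strip())

metric = get_metric('exact_match_demo')  # returns the callable registered above
print(metric(' 42 ', '42'))  # 1.0
```

The same register/get pattern repeats for benchmarks, model APIs, filters, and aggregation functions.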

evalscope/api/tool/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .tool_call import ToolCall, ToolCallContent, ToolCallError, ToolCallView, ToolChoice, ToolFunction
+ from .tool_info import Tool, ToolDescription, ToolInfo, ToolParams, set_tool_description, tool_description
+ from .utils import parse_tool_call, tool_parse_error_message

evalscope/api/tool/tool_call.py ADDED
@@ -0,0 +1,101 @@
+ import json
+ from pydantic import BaseModel, Field, JsonValue, field_validator
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
+
+
+ class ToolFunction(BaseModel):
+     """Indicate that a specific tool function should be called."""
+
+     name: str
+     """The name of the tool function to call."""
+
+     arguments: Dict[str, Any]
+     """The arguments of the tool function to call."""
+
+     @field_validator('arguments', mode='before')
+     @classmethod
+     def parse_arguments(cls, v):
+         if isinstance(v, str):
+             try:
+                 v = json.loads(v)
+             except Exception as e:
+                 raise ValueError(f'arguments field string is not valid JSON: {e}')
+         if not isinstance(v, dict):
+             raise ValueError('arguments must be a dict or a JSON string representing a dict')
+         return v
+
+
+ class ToolCallContent(BaseModel):
+     """Content to include in a tool call view."""
+
+     title: Optional[str] = Field(default=None)
+     """Optional (plain text) title for tool call content."""
+
+     format: Literal['text', 'markdown']
+     """Format (text or markdown)."""
+
+     content: str
+     """Text or markdown content."""
+
+
+ class ToolCallView(BaseModel):
+     """Custom view of a tool call.
+
+     Both `context` and `call` are optional. If `call` is not specified
+     then the view will default to a syntax-highlighted Python function call.
+     """
+
+     context: Optional[ToolCallContent] = Field(default=None)
+     """Context for the tool call (i.e. current tool state)."""
+
+     call: Optional[ToolCallContent] = Field(default=None)
+     """Custom representation of the tool call."""
+
+
+ class ToolCall(BaseModel):
+     id: str
+     """Unique identifier for the tool call."""
+
+     function: ToolFunction
+     """Function to call."""
+
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+     parse_error: Optional[str] = Field(default=None)
+     """Error which occurred while parsing the tool call."""
+
+     view: Optional[ToolCallContent] = Field(default=None)
+     """Custom view of the tool call input."""
+
+     type: Optional[str] = Field(default=None)
+     """Tool call type (deprecated)."""
+
+
+ class ToolCallError(BaseModel):
+     """Error raised by a tool call."""
+
+     type: Literal[
+         'parsing',
+         'timeout',
+         'unicode_decode',
+         'permission',
+         'file_not_found',
+         'is_a_directory',
+         'limit',
+         'approval',
+         'unknown',
+     ]
+     """Error type."""
+
+     message: str
+     """Error message."""
+
+
+ ToolChoice = Union[Literal['auto', 'any', 'none'], ToolFunction]
+ """Specify which tool to call.
+
+ "auto" means the model decides; "any" means use at least one tool;
+ "none" means never call a tool; a ToolFunction instructs the model
+ to call a specific function.
+ """

evalscope/api/tool/tool_info.py ADDED
@@ -0,0 +1,173 @@
+ import inspect
+ from dataclasses import dataclass
+ from docstring_parser import Docstring, parse
+ from pydantic import BaseModel, Field
+ from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
+
+ from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
+
+ ToolParam: TypeAlias = JSONSchema
+ """Description of a tool parameter in JSON Schema format."""
+
+
+ class Tool:
+
+     def __call__(
+         self,
+         *args: Any,
+         **kwargs: Any,
+     ) -> Any:
+         ...
+
+
+ class ToolParams(BaseModel):
+     """Description of the tool parameters object in JSON Schema format."""
+
+     type: Literal['object'] = Field(default='object')
+     """Params type (always 'object')."""
+
+     properties: Dict[str, ToolParam] = Field(default_factory=dict)
+     """Tool function parameters."""
+
+     required: List[str] = Field(default_factory=list)
+     """List of required fields."""
+
+     additionalProperties: bool = Field(default=False)
+     """Are additional object properties allowed? (always `False`)"""
+
+
+ @dataclass
+ class ToolDescription:
+     name: Optional[str] = None
+     description: Optional[str] = None
+     parameters: Optional[ToolParams] = None
+
+
+ def tool_description(tool: Tool) -> ToolDescription:
+     return getattr(tool, TOOL_DESCRIPTION, ToolDescription())
+
+
+ def set_tool_description(tool: Tool, description: ToolDescription) -> None:
+     setattr(tool, TOOL_DESCRIPTION, description)
+
+
+ TOOL_DESCRIPTION = '__TOOL_DESCRIPTION__'
+
+
+ class ToolInfo(BaseModel):
+     """Specification of a tool (JSON Schema compatible).
+
+     If you are implementing a ModelAPI, most LLM libraries can
+     be passed this object (dumped to a dict) directly as a function
+     specification. For example, in the OpenAI provider:
+
+     ```python
+     ChatCompletionToolParam(
+         type="function",
+         function=tool.model_dump(exclude_none=True),
+     )
+     ```
+
+     In some cases the field names don't match up exactly. In that case
+     call `model_dump()` on the `parameters` field. For example, in the
+     Anthropic provider:
+
+     ```python
+     ToolParam(
+         name=tool.name,
+         description=tool.description,
+         input_schema=tool.parameters.model_dump(exclude_none=True),
+     )
+     ```
+     """
+
+     name: str
+     """Name of the tool."""
+     description: str
+     """Short description of the tool."""
+     parameters: ToolParams = Field(default_factory=ToolParams)
+     """JSON Schema of the tool parameters object."""
+     options: Optional[Dict[str, object]] = Field(default=None)
+     """Optional property bag that the model provider can use to customize the implementation of the tool."""
+
+
+ def parse_tool_info(func: Callable[..., Any]) -> ToolInfo:
+     # the tool may already have registry attributes with tool info
+     description = tool_description(func)
+     if description.name and description.description and description.parameters is not None:
+         return ToolInfo(
+             name=description.name,
+             description=description.description,
+             parameters=description.parameters,
+         )
+
+     signature = inspect.signature(func)
+     type_hints = get_type_hints(func)
+     docstring = inspect.getdoc(func)
+     parsed_docstring: Optional[Docstring] = parse(docstring) if docstring else None
+
+     info = ToolInfo(name=func.__name__, description='')
+
+     for param_name, param in signature.parameters.items():
+         tool_param = ToolParam()
+
+         # Parse docstring
+         docstring_info = parse_docstring(docstring, param_name)
+
+         # Get type information from type annotations
+         if param_name in type_hints:
+             tool_param = json_schema(type_hints[param_name])
+         # as a fallback try to parse it from the docstring
+         # (this is minimally necessary for backwards compatibility
+         # with gen1 tool type parsing, which only used docstrings)
+         elif 'docstring_type' in docstring_info:
+             json_type = python_type_to_json_type(docstring_info['docstring_type'])
+             if json_type and (json_type in get_args(JSONType)):
+                 tool_param = ToolParam(type=json_type)
+
+         # Get default value
+         if param.default is param.empty:
+             info.parameters.required.append(param_name)
+         else:
+             tool_param.default = param.default
+
+         # Add description from docstring
+         if 'description' in docstring_info:
+             tool_param.description = docstring_info['description']
+
+         # append the tool param
+         info.parameters.properties[param_name] = tool_param
+
+     # Add function description if available
+     if parsed_docstring:
+         if parsed_docstring.description:
+             info.description = parsed_docstring.description.strip()
+         elif parsed_docstring.long_description:
+             info.description = parsed_docstring.long_description.strip()
+         elif parsed_docstring.short_description:
+             info.description = parsed_docstring.short_description.strip()
+
+         # Add examples if available
+         if parsed_docstring.examples:
+             examples = '\n\n'.join([(example.description or '') for example in parsed_docstring.examples])
+             info.description = f'{info.description}\n\nExamples\n\n{examples}'
+
+     return info
+
+
+ def parse_docstring(docstring: Optional[str], param_name: str) -> Dict[str, str]:
+     if not docstring:
+         return {}
+
+     parsed_docstring: Docstring = parse(docstring)
+
+     for param in parsed_docstring.params:
+         if param.arg_name == param_name:
+             schema: Dict[str, str] = {'description': param.description or ''}
+
+             if param.type_name:
+                 schema['docstring_type'] = param.type_name
+
+             return schema
+
+     return {}
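
To make the introspection flow concrete, a hypothetical example (the `add` function below is not in the codebase): `parse_tool_info` derives each parameter's schema from type hints, pulls descriptions from the docstring, and marks parameters without defaults as required.

```python
from evalscope.api.tool.tool_info import parse_tool_info

def add(a: int, b: int = 0) -> int:
    """Add two integers.

    Args:
        a: First operand.
        b: Optional second operand.
    """
    return a + b

info = parse_tool_info(add)
print(info.name)                 # add
print(info.description)          # Add two integers.
print(info.parameters.required)  # ['a']
```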

evalscope/api/tool/utils.py ADDED
@@ -0,0 +1,64 @@
+ import json
+ import yaml
+ from typing import Any, Dict, List, Optional
+
+ from evalscope.utils import get_logger
+ from .tool_call import ToolCall, ToolFunction
+ from .tool_info import ToolInfo
+
+ logger = get_logger()
+
+
+ def parse_tool_call(id: str, function: str, arguments: str, tools: Optional[List[ToolInfo]] = None) -> ToolCall:
+     """Parse a tool call from a JSON payload.
+
+     Note that this function doesn't know about internal tool names, so the caller
+     should amend the returned `ToolCall` by mapping the parsed `function` field
+     from an internal name to an inspect tool name and fixing up the `ToolCall`
+     object as required to reflect this change.
+     """
+     error: Optional[str] = None
+     arguments_dict: Dict[str, Any] = {}
+
+     def report_parse_error(ex: Exception) -> None:
+         nonlocal error
+         error = tool_parse_error_message(arguments, ex)
+         logger.info(error)
+
+     # if the arguments look like a JSON object, parse them with a plain json.loads
+     arguments = arguments.strip()
+     if arguments.startswith('{'):
+         try:
+             arguments_dict = json.loads(arguments)
+         except json.JSONDecodeError as ex:
+             report_parse_error(ex)
+
+     # otherwise parse it as YAML (which will pick up unquoted strings, numbers, and true/false)
+     # and then create a dict that maps it to the first function argument
+     elif function and tools:
+         tool_info = next(
+             (tool for tool in tools if tool.name == function and len(tool.parameters.properties) > 0),
+             None,
+         )
+         if tool_info:
+             param_names = list(tool_info.parameters.properties.keys())
+             try:
+                 value = yaml.safe_load(arguments)
+                 arguments_dict[param_names[0]] = value
+             except yaml.error.YAMLError:
+                 # If the yaml parser fails, we treat it as a string argument.
+                 arguments_dict[param_names[0]] = arguments
+
+     # return the ToolCall (with the parse error payload, if any)
+     return ToolCall(
+         id=id,
+         function=ToolFunction(
+             name=function,
+             arguments=arguments_dict,
+         ),
+         parse_error=error,
+     )
+
+
+ def tool_parse_error_message(arguments: str, ex: Exception) -> str:
+     return f'Error parsing the following tool call arguments:\n\n{arguments}\n\nError details: {ex}'
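
And a behavior sketch for `parse_tool_call` (the ids and tool name are made up): arguments that look like a JSON object go through `json.loads`, and a malformed payload is surfaced via `parse_error` on the returned `ToolCall` rather than raising.

```python
from evalscope.api.tool import parse_tool_call

ok = parse_tool_call(id='call_1', function='get_weather', arguments='{"city": "Beijing"}')
print(ok.function.arguments)  # {'city': 'Beijing'}
print(ok.parse_error)         # None

bad = parse_tool_call(id='call_2', function='get_weather', arguments='{"city": ')
print(bad.function.arguments)       # {} (left empty on parse failure)
print(bad.parse_error is not None)  # True
```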

evalscope/app/app.py CHANGED
@@ -6,6 +6,7 @@ import argparse
  from evalscope.utils.logger import configure_logging
  from .arguments import add_argument
  from .ui import create_app_ui
+ from .utils.env_utils import setup_env
 
 
  def create_app(args: argparse.Namespace):
@@ -17,6 +18,8 @@ def create_app(args: argparse.Namespace):
      """
      configure_logging(debug=args.debug)
 
+     setup_env(args)
+
      demo = create_app_ui(args)
 
      demo.launch(
evalscope/app/ui/app_ui.py CHANGED
@@ -32,7 +32,8 @@ def create_app_ui(args: argparse.Namespace):
 
      @sidebar.load_btn.click(
          inputs=[sidebar.reports_dropdown],
-         outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
+         outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name]
+     )
      def update_displays(reports_dropdown):
          if not reports_dropdown:
              gr.Warning(locale_dict['note'], duration=3)