evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
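Taken together, the file list shows the 0.17.x adapter stack (evalscope/benchmarks/data_adapter.py and evalscope/models/adapters/) being removed in favor of a new evalscope/api package. As rough orientation before the hunks below, a hedged sketch of the import paths that those hunks confirm; anything beyond the paths themselves should be checked against the 1.0.1 source:

```python
# Orientation only: these import paths appear in the hunks further down; the
# trailing comments are informal mappings, not guaranteed one-to-one replacements.
from evalscope.api.registry import get_benchmark                  # replaces Benchmark.get(...) lookups
from evalscope.api.model import GenerateConfig, Model, ModelAPI   # typed generation/model config objects
from evalscope.api.dataset import DatasetDict                     # returned by benchmark_meta.load_dataset()
from evalscope.config import TaskConfig                           # now validates dict configs into GenerateConfig
```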
evalscope/cli/start_app.py CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.app import create_app
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.perf.main import run_perf_benchmark
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)
evalscope/cli/start_server.py CHANGED
@@ -25,14 +25,16 @@ def add_perf_args(parser):
         '--logdir',
         required=True,
         type=str,
-        help='The monitor log save dir, tensorboard start at this path for display!')
+        help='The monitor log save dir, tensorboard start at this path for display!'
+    )
     parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
     parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
 
 
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8'
+    )
     return sub_process
 
 
@@ -61,7 +63,8 @@ def start_server(args):
         bufsize=1,
         shell=True,
         universal_newlines=True,
-        encoding='utf8')
+        encoding='utf8'
+    )
 
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
evalscope/collections/__init__.py CHANGED
@@ -4,20 +4,12 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .evaluator import EvaluatorCollection
-    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .sampler import DatasetEntry, StratifiedSampler, UniformSampler, WeightedSampler
     from .schema import CollectionSchema, DatasetInfo
 
 else:
     _import_structure = {
-        'evaluator': [
-            'EvaluatorCollection',
-        ],
-        'sampler': [
-            'StratifiedSampler',
-            'UniformSampler',
-            'WeightedSampler',
-        ],
+        'sampler': ['StratifiedSampler', 'UniformSampler', 'WeightedSampler', 'DatasetEntry'],
         'schema': [
             'CollectionSchema',
             'DatasetInfo',
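The removal of EvaluatorCollection and the export of DatasetEntry change the public import surface of evalscope.collections. A minimal sketch of what the lazy-import table above now exposes:

```python
# Imports only; names are taken verbatim from the _import_structure shown above.
from evalscope.collections import (
    CollectionSchema,
    DatasetEntry,
    DatasetInfo,
    StratifiedSampler,
    UniformSampler,
    WeightedSampler,
)
```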
evalscope/collections/sampler.py CHANGED
@@ -1,18 +1,17 @@
 import random
 from abc import ABC, abstractmethod
-from dataclasses import asdict, dataclass, field
+from pydantic import BaseModel, Field
 from tqdm import tqdm
 from typing import List, Optional
 
 from evalscope.collections.schema import CollectionSchema, DatasetInfo
 
 
-@dataclass
-class DatasetEntry:
+class DatasetEntry(BaseModel):
     index: int = 0
-    prompt: dict = field(default_factory=dict)
-    tags: List[str] = field(default_factory=list)
-    categories: List[str] = field(default_factory=list)
+    prompt: dict = Field(default_factory=dict)
+    tags: List[str] = Field(default_factory=list)
+    categories: List[str] = Field(default_factory=list)
     task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
@@ -33,17 +32,18 @@ class Sampler(ABC):
         all_data = []
         data_dict = dataset.get_data()
         for subset_name, subset_data in data_dict.items():
-            for prompt in subset_data:
+            for sample in subset_data:
                 all_data.append(
                     DatasetEntry(
-                        prompt=prompt,
+                        prompt=sample.model_dump(exclude_none=True),
                         tags=dataset.tags,
                         categories=dataset.hierarchy,
                         task_type=dataset.task_type,
                         weight=dataset.weight,
                         dataset_name=dataset.name,
                         subset_name=subset_name,
-                    ))
+                    )
+                )
         count = min(count, len(all_data))  # avoid sampling more than the dataset size
         sampled_data = random.sample(all_data, k=count)
         return sampled_data
@@ -52,7 +52,7 @@ class Sampler(ABC):
         result = []
         for i, entry in enumerate(all_data):
             entry.index = i
-            result.append(asdict(entry))
+            result.append(entry.model_dump())
         return result
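With DatasetEntry now a pydantic model, serialization goes through model_dump() instead of dataclasses.asdict(). A small sketch using only the fields visible in the hunks above (the concrete values are made up for illustration):

```python
from evalscope.collections.sampler import DatasetEntry

# Field names come from the diff above; the values here are illustrative.
entry = DatasetEntry(
    index=0,
    prompt={'question': 'What is 1 + 1?'},
    tags=['en', 'math'],
    categories=['math'],
    task_type='math',
    weight=1.0,
    dataset_name='gsm8k',
    subset_name='main',
)
print(entry.model_dump())  # plain dict, matching result.append(entry.model_dump()) above
```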
 
evalscope/collections/schema.py CHANGED
@@ -3,6 +3,10 @@ import json
 from dataclasses import asdict, dataclass, field
 from typing import List, Union
 
+from evalscope.api.dataset import DatasetDict
+from evalscope.api.registry import get_benchmark
+from evalscope.config import TaskConfig
+
 
 
 @dataclass
@@ -13,15 +17,11 @@ class DatasetInfo:
     args: dict = field(default_factory=dict)
     hierarchy: List[str] = field(default_factory=list)
 
-    def get_data(self) -> dict:
-        from evalscope.benchmarks import Benchmark
-
-        benchmark_meta = Benchmark.get(self.name)
-
-        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load()
-        prompts = data_adapter.gen_prompts(data_dict)
-        return prompts
+    def get_data(self) -> DatasetDict:
+        dataset_args = {self.name: self.args}
+        benchmark_meta = get_benchmark(self.name, config=TaskConfig(dataset_args=dataset_args))
+        data_dict = benchmark_meta.load_dataset()
+        return data_dict
 
 
 def flatten_weight(collection: 'CollectionSchema', base_weight=1):
@@ -111,8 +111,10 @@ if __name__ == '__main__':
             ]),
             CollectionSchema(
                 name='chinese',
-                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
-        ])
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
+            )
+        ]
+    )
     print(schema)
     print(schema.flatten())
     schema.dump_json('outputs/schema.json')
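DatasetInfo.get_data() now routes through the benchmark registry and returns a DatasetDict rather than pre-generated prompts. A minimal sketch of that loading path, using only the calls visible in the hunk above (the 'ceval'/'logic' arguments mirror the __main__ example):

```python
from evalscope.api.registry import get_benchmark
from evalscope.config import TaskConfig

dataset_args = {'ceval': {'subset_list': ['logic']}}
benchmark_meta = get_benchmark('ceval', config=TaskConfig(dataset_args=dataset_args))
data_dict = benchmark_meta.load_dataset()  # DatasetDict, iterated as (subset_name, samples)

for subset_name, subset_data in data_dict.items():
    first = next(iter(subset_data))
    print(subset_name, first.model_dump(exclude_none=True))  # samples are pydantic objects, per sampler.py
```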
evalscope/config.py CHANGED
@@ -1,16 +1,24 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# flake8: noqa: E501
 import copy
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
-                                 JudgeStrategy, ModelTask, OutputType)
-from evalscope.models import CustomModel, DummyCustomModel
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
+from evalscope.constants import (
+    DEFAULT_DATASET_CACHE_DIR,
+    DEFAULT_WORK_DIR,
+    EvalBackend,
+    EvalType,
+    HubType,
+    JudgeStrategy,
+    ModelTask,
+)
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
-from evalscope.utils.io_utils import dict_to_yaml, gen_hash
+from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -19,104 +27,191 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Union[str, 'CustomModel', None] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
-    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
-    generation_config: Dict = field(default_factory=dict)
+    generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
-    stage: str = EvalStage.ALL
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
-    eval_batch_size: Optional[int] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
+    eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
-    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
+    rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
-    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-    dry_run: bool = False
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-    api_url: Optional[str] = None  # Only used for server model
-    api_key: Optional[str] = 'EMPTY'  # Only used for server model
-    timeout: Optional[float] = None  # Only used for server model
-    stream: bool = False  # Only used for server model
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
    judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
 
     def __post_init__(self):
+        self.__init_model_and_id()
+
+        self.__init_eval_data_config()
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_model_and_id(self):
+        # Set model to DummyCustomModel if not provided
         if self.model is None:
-            self.model = DummyCustomModel()
-            self.eval_type = EvalType.CUSTOM
+            self.model = self.model_task
+            self.eval_type = EvalType.MOCK_LLM
 
-        if (not self.model_id) and self.model:
-            if isinstance(self.model, CustomModel):
-                self.model_id = self.model.config.get('model_id', 'custom_model')
+        # Set model_id if not provided
+        if not self.model_id:
+            if isinstance(self.model, str):
+                self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
-                self.model_id = os.path.basename(self.model).rstrip(os.sep)
-                # fix path error, see http://github.com/modelscope/evalscope/issues/377
-                self.model_id = self.model_id.replace(':', '-')
-
-        # Set default eval_batch_size based on eval_type
-        if self.eval_batch_size is None:
-            self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
+                self.model_id = 'dummy_model'
 
+    def __init_eval_data_config(self):
         # Post process limit
         if self.limit is not None:
             self.limit = parse_int_or_float(self.limit)
 
-        # Set default generation_config and model_args
-        self.__init_default_generation_config()
-        self.__init_default_model_args()
-
     def __init_default_generation_config(self):
-        if self.generation_config:
-            return
-        if self.model_task == ModelTask.IMAGE_GENERATION:
-            self.generation_config = {
-                'height': 1024,
-                'width': 1024,
-                'num_inference_steps': 50,
-                'guidance_scale': 9.0,
-            }
-        elif self.model_task == ModelTask.TEXT_GENERATION:
-            if self.eval_type == EvalType.CHECKPOINT:
+        if not self.generation_config:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
                 self.generation_config = {
-                    'max_length': 2048,
-                    'max_new_tokens': 512,
-                    'do_sample': False,
-                    'top_k': 50,
-                    'top_p': 1.0,
-                    'temperature': 1.0,
-                }
-            elif self.eval_type == EvalType.SERVICE:
-                self.generation_config = {
-                    'max_tokens': 2048,
-                    'temperature': 0.0,
+                    'height': 1024,
+                    'width': 1024,
+                    'num_inference_steps': 50,
+                    'guidance_scale': 9.0,
                 }
+                if self.eval_batch_size != 1:
+                    logger.warning(
+                        'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                    )
+                    self.eval_batch_size = 1
+            elif self.model_task == ModelTask.TEXT_GENERATION:
+                if self.eval_type == EvalType.CHECKPOINT:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'do_sample': False,
+                        'top_k': 50,
+                        'top_p': 1.0,
+                        'temperature': 1.0,
+                        'n': 1,
+                    }
+                elif self.eval_type == EvalType.SERVICE:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'temperature': 0.0,
+                    }
+        if isinstance(self.generation_config, dict):
+            self.generation_config = GenerateConfig.model_validate(self.generation_config)
+
+        # Set eval_batch_size to generation_config.batch_size
+        self.generation_config.batch_size = self.eval_batch_size
+
+        # Set default values for generation_config
+        if self.timeout is not None:
+            deprecated_warning(
+                logger,
+                'The `timeout` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.timeout` instead.'
+            )
+            self.generation_config.timeout = self.timeout
+
+        if self.stream is not None:
+            deprecated_warning(
+                logger,
+                'The `stream` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.stream` instead.'
+            )
+            self.generation_config.stream = self.stream
+
+        if self.generation_config.n is not None and self.generation_config.n > 1:
+            self.repeats = self.generation_config.n
+            self.generation_config.n = 1
+            deprecated_warning(
+                logger,
+                'The `n` parameter in generation_config is deprecated and will be removed in v1.1.0. Use `TaskConfig.repeats` instead.'
+            )
 
     def __init_default_model_args(self):
         if self.model_args:
@@ -143,9 +238,14 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
+        result = copy.deepcopy(self.__dict__)
+        del result['api_key']  # Do not expose api_key in the config
+
+        if isinstance(self.model, (Model, ModelAPI)):
             result['model'] = self.model.__class__.__name__
+
+        if isinstance(self.generation_config, GenerateConfig):
+            result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
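Pulling the config.py changes together: a dict generation_config is validated into a GenerateConfig, eval_batch_size is copied onto it, and to_dict() no longer exposes api_key. A hedged end-to-end sketch (the model name, dataset, and endpoint are illustrative; the behavior in the comments comes from the hunks above):

```python
from evalscope.config import TaskConfig
from evalscope.constants import EvalType

task = TaskConfig(
    model='Qwen2.5-7B-Instruct',             # model_id is derived via safe_filename(os.path.basename(...))
    datasets=['gsm8k'],
    eval_type=EvalType.SERVICE,              # now the string 'openai_api'
    api_url='http://localhost:8000/v1',      # illustrative endpoint
    generation_config={'max_tokens': 2048, 'temperature': 0.0},
    eval_batch_size=8,
)
print(type(task.generation_config).__name__)  # GenerateConfig (the dict is validated in __post_init__)
print(task.generation_config.batch_size)      # 8, copied from eval_batch_size
print('api_key' in task.to_dict())            # False: to_dict() drops it
```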
 
evalscope/constants.py CHANGED
@@ -9,9 +9,12 @@ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
 DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
-DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
-DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub/models
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/hub/datasets
 DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
+    os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
+)  # ~/.cache/evalscope
 
 
 class HubType:
@@ -44,22 +47,12 @@ class MetricsConstant:
 class ArenaWinner:
 
     MODEL_A = 'model_a'
-
     MODEL_B = 'model_b'
-
     TIE = 'tie'
-
     TIE_BOTH_BAD = 'tie_both_bad'
-
     UNKNOWN = 'unknown'
 
 
-class ArenaMode:
-    SINGLE = 'single'
-    PAIRWISE = 'pairwise'
-    PAIRWISE_BASELINE = 'pairwise_baseline'
-
-
 class AnswerKeys:
     INDEX = 'index'
     ANSWER_ID = 'answer_id'
@@ -70,58 +63,14 @@ class AnswerKeys:
     CHOICES = 'choices'
 
 
-class ReviewKeys:
-    REVIEW_ID = 'review_id'
-    REVIEWED = 'reviewed'
-    REVIEWER_SPEC = 'reviewer_spec'
-    REVIEW_TIME = 'review_time'
-    MESSAGE = 'message'
-    CONTENT = 'content'
-    GOLD = 'gold'
-    PRED = 'pred'
-    RESULT = 'result'
-    REVIEW = 'review'
-
-
-class EvalConfigKeys:
-    CLASS_REF = 'ref'
-    CLASS_ARGS = 'args'
-    ENABLE = 'enable'
-    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
-    RANDOM_SEED = 'random_seed'
-    FN_COMPLETION_PARSER = 'fn_completion_parser'
-    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
-    OUTPUT_FILE = 'output_file'
-    MODEL_ID_OR_PATH = 'model_id_or_path'
-    MODEL_REVISION = 'revision'
-    GENERATION_CONFIG = 'generation_config'
-    PRECISION = 'precision'
-    TEMPLATE_TYPE = 'template_type'
-
-
-class FnCompletionParser:
-    LMSYS_PARSER: str = 'lmsys_parser'
-    RANKING_PARSER: str = 'ranking_parser'
-
-
-class PositionBiasMitigation:
-    NONE = 'none'
-    RANDOMIZE_ORDER = 'randomize_order'
-    SWAP_POSITION = 'swap_position'
-
-
-class EvalStage:
-    # Enums: `all`, `infer`, `review`
-    ALL = 'all'
-    INFER = 'infer'
-    REVIEW = 'review'
-
-
 class EvalType:
 
     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'  # native model checkpoint
-    SERVICE = 'service'  # model service
+    MOCK_LLM = 'mock_llm'
+    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
+    SERVICE = 'openai_api'  # model service
+    TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -142,6 +91,7 @@ class EvalBackend:
 
 class DataCollection:
     NAME = 'data_collection'
+    INFO = 'collection_info'
 
 
 class JudgeStrategy:
@@ -159,3 +109,29 @@ class JudgeScoreType:
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
+
+
+class Tags:
+    KNOWLEDGE = 'Knowledge'
+    MULTIPLE_CHOICE = 'MCQ'
+    MATH = 'Math'
+    REASONING = 'Reasoning'
+    CODING = 'Coding'
+    CHINESE = 'Chinese'
+    COMMONSENSE = 'Commonsense'
+    QA = 'QA'
+    READING_COMPREHENSION = 'ReadingComprehension'
+    CUSTOM = 'Custom'
+    INSTRUCTION_FOLLOWING = 'InstructionFollowing'
+    ARENA = 'Arena'
+    LONG_CONTEXT = 'LongContext'
+    RETRIEVAL = 'Retrieval'
+    FUNCTION_CALLING = 'FunctionCalling'
+    TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
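The EvalType values above were renamed, so configuration files or scripts that hard-coded the old strings need updating. A quick sanity check based only on the constants shown in this hunk:

```python
from evalscope.constants import EvalType, Tags

assert EvalType.CHECKPOINT == 'llm_ckpt'   # was 'checkpoint' in 0.17.x
assert EvalType.SERVICE == 'openai_api'    # was 'service' in 0.17.x
print(EvalType.MOCK_LLM, EvalType.TEXT2IMAGE, EvalType.IMAGE_EDITING)  # new eval types
print(Tags.KNOWLEDGE, Tags.FUNCTION_CALLING)                           # new benchmark tag constants
```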
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from .evaluator import Evaluator
+from .evaluator import DefaultEvaluator
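For code that imported the evaluator directly, the rename above is the only change needed at the import site; a one-line migration sketch:

```python
from evalscope.evaluator import DefaultEvaluator  # 0.17.x exported this symbol as `Evaluator`
```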