evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -61,17 +61,18 @@ def t5_tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_IN
61
61
 
62
62
 
63
63
  def load_pretrained_model(
64
- model_cls,
65
- model_args,
66
- model_path=None,
67
- tokenizer_path=None,
68
- model_max_length=None,
69
- padding_side=None,
70
- image_aspect_ratio='pad', # or 'square'
71
- mmprojector_repo=None,
72
- mmprojector_name=None,
73
- device='cuda',
74
- cache_dir=CACHE_DIR):
64
+ model_cls,
65
+ model_args,
66
+ model_path=None,
67
+ tokenizer_path=None,
68
+ model_max_length=None,
69
+ padding_side=None,
70
+ image_aspect_ratio='pad', # or 'square'
71
+ mmprojector_repo=None,
72
+ mmprojector_name=None,
73
+ device='cuda',
74
+ cache_dir=CACHE_DIR
75
+ ):
75
76
  tokenizer_dict = {}
76
77
  if model_max_length:
77
78
  tokenizer_dict['model_max_length'] = model_max_length
@@ -80,7 +81,7 @@ def load_pretrained_model(
80
81
 
81
82
  from ..utils import download_file
82
83
 
83
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, **tokenizer_dict)
84
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_dict)
84
85
  # tokenizer.pad_token = tokenizer.unk_token # could be redundant
85
86
 
86
87
  model_path = download_file(model_path, cache_dir=cache_dir)
@@ -106,7 +107,8 @@ def load_pretrained_model(
106
107
  model_args.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter # important to set to correct path
107
108
 
108
109
  model.get_model().initialize_vision_modules(
109
- model_args) # This will load the CLIP vision encoder and MLP projector
110
+ model_args
111
+ ) # This will load the CLIP vision encoder and MLP projector
110
112
  else:
111
113
  model.resize_token_embeddings(len(tokenizer)) # perhaps not needed
112
114
 
@@ -8,8 +8,9 @@ from ..model import ScoreModel
8
8
  class VQAScoreModel(ScoreModel):
9
9
 
10
10
  @abstractmethod
11
- def forward(self, images: List[str], texts: List[str], question_template: str,
12
- answer_template: str) -> torch.Tensor:
11
+ def forward(
12
+ self, images: List[str], texts: List[str], question_template: str, answer_template: str
13
+ ) -> torch.Tensor:
13
14
  """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
14
15
  question_template: a string with optional {} to be replaced with the 'text'
15
16
  answer_template: a string with optional {} to be replaced with the 'text'
@@ -4,38 +4,15 @@ from typing import TYPE_CHECKING
4
4
  from evalscope.utils.import_utils import _LazyModule
5
5
 
6
6
  if TYPE_CHECKING:
7
- from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
8
- CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
9
- TauBenchAdapter, initialize_model_adapter)
10
- from .custom import CustomModel, DummyCustomModel
11
- from .local_model import LocalModel, get_local_model
12
- from .register import get_model_adapter
7
+ from .model_apis import llm_ckpt, mockllm, openai_api
13
8
 
14
9
  else:
15
10
  _import_structure = {
16
- 'adapters': [
17
- 'BaseModelAdapter',
18
- 'initialize_model_adapter',
19
- 'ChatGenerationModelAdapter',
20
- 'ContinuationLogitsModelAdapter',
21
- 'MultiChoiceModelAdapter',
22
- 'CustomModelAdapter',
23
- 'ServerModelAdapter',
24
- 'T2IModelAdapter',
25
- 'TauBenchAdapter',
26
- 'BFCLAdapter',
27
- ],
28
- 'custom': [
29
- 'CustomModel',
30
- 'DummyCustomModel',
31
- ],
32
- 'local_model': [
33
- 'LocalModel',
34
- 'get_local_model',
35
- ],
36
- 'register': [
37
- 'get_model_adapter',
38
- ],
11
+ 'model_apis': [
12
+ 'openai_api',
13
+ 'mockllm',
14
+ 'llm_ckpt',
15
+ ]
39
16
  }
40
17
 
41
18
  import sys
@@ -0,0 +1,125 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import time
5
+ import torch
6
+ from logging import getLogger
7
+ from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
8
+
9
+ from evalscope.api.messages import (
10
+ ChatMessage,
11
+ ChatMessageAssistant,
12
+ ContentAudio,
13
+ ContentImage,
14
+ ContentText,
15
+ ContentVideo,
16
+ )
17
+ from evalscope.api.model import (
18
+ ChatCompletionChoice,
19
+ GenerateConfig,
20
+ Logprob,
21
+ Logprobs,
22
+ ModelAPI,
23
+ ModelOutput,
24
+ ModelUsage,
25
+ TopLogprob,
26
+ )
27
+ from evalscope.api.tool import ToolChoice, ToolInfo
28
+ from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
29
+ from evalscope.utils.model_utils import get_device
30
+
31
+ logger = getLogger()
32
+
33
+
34
class ImageEditAPI(ModelAPI):
    """Model API that edits an input image with a locally loaded pipeline.

    The pipeline class is looked up by name on the ``modelscope`` package,
    instantiated through ``from_pretrained`` and moved to the target device.
    """

    def __init__(
        self,
        model_name: str,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        config: GenerateConfig = GenerateConfig(),
        **model_args: Any,
    ):
        """Load the image-editing pipeline.

        Args:
            model_name: Model identifier; also used as the load location when
                no ``model_path`` is supplied.
            base_url: Forwarded unchanged to ``ModelAPI``.
            api_key: Forwarded unchanged to ``ModelAPI``.
            config: Forwarded unchanged to ``ModelAPI``.
            **model_args: Recognised keys are ``model_path``, ``precision`` /
                ``torch_dtype`` (``precision`` wins when both are set),
                ``device_map`` and ``pipeline_cls``. Every other key is passed
                through to ``from_pretrained``.

        Raises:
            ValueError: If ``pipeline_cls`` is not given and cannot be
                inferred from ``model_name``.
        """
        super().__init__(
            model_name=model_name,
            base_url=base_url,
            api_key=api_key,
            config=config,
        )

        # Pop every recognised key unconditionally. Leaving one behind (for
        # example ``torch_dtype`` when ``precision`` is also supplied, or any
        # key explicitly set to None) would forward it through ``**model_args``
        # below, where ``torch_dtype`` collides with the explicit keyword.
        model_path = model_args.pop('model_path', None)
        precision = model_args.pop('precision', None)
        explicit_dtype = model_args.pop('torch_dtype', None)
        device_map = model_args.pop('device_map', None)
        pipeline_cls = model_args.pop('pipeline_cls', None)

        # Resolve the dtype: string names are mapped to torch dtypes, while
        # 'auto' and non-string values are handed to the pipeline untouched.
        torch_dtype = precision or explicit_dtype
        dtype_by_name = {
            'float16': torch.float16,
            'float32': torch.float32,
            'bfloat16': torch.bfloat16,
        }
        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
            if torch_dtype not in dtype_by_name:
                logger.warning(f'Unknown torch dtype {torch_dtype!r}, falling back to float32.')
            torch_dtype = dtype_by_name.get(torch_dtype, torch.float32)
        self.torch_dtype = torch_dtype
        self.device = device_map or get_device()

        # Without an explicit pipeline class, only Qwen models have a known
        # default; anything else is a configuration error.
        self.pipeline_cls = pipeline_cls
        if self.pipeline_cls is None:
            if 'qwen' in model_name.lower():
                self.pipeline_cls = 'QwenImageEditPipeline'
            else:
                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
                raise ValueError('Invalid pipeline class.')

        model_name_or_path = model_path or model_name

        # Equivalent to ``from modelscope import <pipeline_cls>``.
        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')

        self.model = module.from_pretrained(
            model_name_or_path,
            torch_dtype=self.torch_dtype,
            **model_args,
        )

        self.model.to(self.device)

    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        """Edit the image carried by the first message and return the result.

        Args:
            input: Chat messages; only the first one is read. Its content must
                start with a ``ContentText`` (the prompt) followed by a
                ``ContentImage`` (the base64-encoded image to edit).
            tools: Unused by this model.
            tool_choice: Unused by this model.
            config: Supplies ``num_inference_steps`` and any extra fields,
                which are forwarded to the pipeline call.

        Returns:
            A ``ModelOutput`` whose single choice holds the first generated
            image, base64-encoded.

        Raises:
            ValueError: If ``input`` is empty or the first message does not
                contain (``ContentText``, ``ContentImage``) in that order.
        """
        # Collect the keyword arguments for the pipeline call.
        kwargs: Dict[str, Any] = {}
        if config.num_inference_steps is not None:
            kwargs['num_inference_steps'] = config.num_inference_steps
        # ``model_extra`` may be None when the config defines no extra fields.
        kwargs.update(config.model_extra or {})

        # Validate explicitly instead of with ``assert`` so the check survives
        # ``python -O`` and short content fails with a clear message.
        if not input:
            raise ValueError('Invalid content types, expected (ContentText, ContentImage)')
        content = input[0].content
        if (
            len(content) < 2 or not isinstance(content[0], ContentText)
            or not isinstance(content[1], ContentImage)
        ):
            raise ValueError('Invalid content types, expected (ContentText, ContentImage)')

        prompt = content[0].text
        input_image = base64_to_PIL(content[1].image)

        # Only the first generated image is returned.
        output = self.model(image=input_image, prompt=prompt, **kwargs)
        image_base64 = PIL_to_base64(output.images[0])

        # NOTE(review): ``time`` receives a wall-clock timestamp here; confirm
        # whether ``ModelOutput.time`` expects an elapsed duration instead.
        return ModelOutput(
            model=self.model_name,
            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
            time=time.time(),
        )
@@ -0,0 +1,65 @@
1
+ from typing import Any, Dict, Generator, Iterable, Iterator, List, Optional, Union
2
+
3
+ from evalscope.api.dataset import Dataset
4
+ from evalscope.api.messages import ChatMessage
5
+ from evalscope.api.model import GenerateConfig, ModelAPI, ModelOutput
6
+ from evalscope.api.tool import ToolChoice, ToolInfo
7
+ from evalscope.utils.function_utils import thread_safe
8
+
9
+
10
class MockLLM(ModelAPI):
    """A mock implementation of the ModelAPI class for testing purposes.

    Always returns default_output, unless you pass in a model_args
    key "custom_outputs" with a value of an Iterable[ModelOutput]
    """

    # Text used for every generated output when no custom outputs are given.
    default_output = 'Default output from mockllm/model'

    # Source of the outputs handed out by ``generate``, one per call.
    outputs: Iterator[ModelOutput]

    def __init__(
        self,
        model_name: str,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        config: GenerateConfig = GenerateConfig(),
        custom_outputs: Optional[Iterable[ModelOutput]] = None,
        **model_args: Any,
    ) -> None:
        """Create the mock model.

        Args:
            model_name: Forwarded to ``ModelAPI``.
            base_url: Forwarded to ``ModelAPI``.
            api_key: Forwarded to ``ModelAPI``.
            config: Forwarded to ``ModelAPI``.
            custom_outputs: Optional outputs to return in order, one per
                ``generate`` call. When omitted, an endless stream of default
                outputs is used.
            **model_args: Stored on the instance as ``model_args``.

        Raises:
            ValueError: If ``custom_outputs`` is given but is not iterable.
        """
        super().__init__(model_name, base_url, api_key, config)
        self.model_args = model_args
        if custom_outputs is None:
            self.outputs = self._default_outputs()
        else:
            # We cannot rely on the user of this model giving custom_outputs
            # the correct type since they do not call this constructor.
            # Hence this type check and the one in generate.
            if not isinstance(custom_outputs, Iterable):
                raise ValueError(
                    f"model_args['custom_outputs'] must be an Iterable or a Generator, got {custom_outputs}"
                )
            self.outputs = iter(custom_outputs)

    def _default_outputs(self) -> Iterator[ModelOutput]:
        """Yield a fresh default ``ModelOutput`` forever."""
        while True:
            yield ModelOutput.from_content(model='mockllm', content=self.default_output)

    @thread_safe
    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        """Return the next prepared output; the arguments are ignored.

        Raises:
            ValueError: If the custom outputs are exhausted, or the next item
                is not a ``ModelOutput``.
        """
        try:
            output = next(self.outputs)
        except StopIteration:
            # The StopIteration carries no useful detail for the caller.
            raise ValueError('custom_outputs ran out of values') from None

        if not isinstance(output, ModelOutput):
            raise ValueError(f'output must be an instance of ModelOutput; got {type(output)}; content: {repr(output)}')
        return output

    def batch_generate(self, inputs: Dataset, config: GenerateConfig) -> List[ModelOutput]:
        """Delegate batch generation to the ``ModelAPI`` implementation."""
        # ``self`` was missing from the original signature, which made every
        # instance call fail before reaching the parent implementation.
        return super().batch_generate(inputs, config)
@@ -0,0 +1,67 @@
1
+ from evalscope.api.model import ModelAPI
2
+ from evalscope.api.registry import register_model_api
3
+ from evalscope.utils.deprecation_utils import deprecated
4
+ from evalscope.utils.import_utils import check_import
5
+
6
+
7
@register_model_api(name='mock_llm')
def mockllm() -> type[ModelAPI]:
    """Return the ``MockLLM`` class, registered under the name ``mock_llm``."""
    # Imported inside the function so the module is only loaded on request.
    from .mockllm import MockLLM as api_class

    return api_class
12
+
13
+
14
@register_model_api(name='openai_api')
def openai_api() -> type[ModelAPI]:
    """Return the ``OpenAICompatibleAPI`` class, registered as ``openai_api``."""
    # Imported inside the function so the module is only loaded on request.
    from .openai_compatible import OpenAICompatibleAPI as api_class

    return api_class
19
+
20
+
21
@register_model_api(name='server')
@deprecated(since='1.0.0', remove_in='1.1.0', alternative='openai_api')
def server() -> type[ModelAPI]:
    """Return the ``OpenAICompatibleAPI`` class under the legacy name ``server``.

    Marked deprecated since 1.0.0 with removal planned for 1.1.0; the
    replacement name is ``openai_api``.
    """
    # Imported inside the function so the module is only loaded on request.
    from .openai_compatible import OpenAICompatibleAPI as api_class

    return api_class
27
+
28
+
29
@register_model_api(name='llm_ckpt')
def llm_ckpt() -> type[ModelAPI]:
    """Return the ``ModelScopeAPI`` class, registered under ``llm_ckpt``.

    ``torch`` is checked first, before the implementation module is imported.
    """
    check_import('torch', package='torch', raise_error=True)

    # Imported only after the dependency check above.
    from .modelscope import ModelScopeAPI as api_class

    return api_class
36
+
37
+
38
@register_model_api(name='checkpoint')
@deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
def checkpoint() -> type[ModelAPI]:
    """Return the ``ModelScopeAPI`` class under the legacy name ``checkpoint``.

    Marked deprecated since 1.0.0 with removal planned for 1.1.0; the
    replacement name is ``llm_ckpt``. ``torch`` is checked before the
    implementation module is imported.
    """
    check_import('torch', package='torch', raise_error=True)

    # Imported only after the dependency check above.
    from .modelscope import ModelScopeAPI as api_class

    return api_class
46
+
47
+
48
@register_model_api(name='text2image')
def text2image() -> type[ModelAPI]:
    """Return the ``Text2ImageAPI`` class, registered under ``text2image``.

    The optional dependencies are checked, in order, before the
    implementation module is imported.
    """
    # Each module is reported against the same ``evalscope[aigc]`` extra.
    for required_module in ('torch', 'torchvision', 'diffusers'):
        check_import(required_module, package='evalscope[aigc]', raise_error=True)

    from .text2image_model import Text2ImageAPI as api_class

    return api_class
57
+
58
+
59
@register_model_api(name='image_editing')
def image_editing() -> type[ModelAPI]:
    """Return the ``ImageEditAPI`` class, registered under ``image_editing``.

    The optional dependencies are checked, in order, before the
    implementation module is imported.
    """
    # Each module is reported against the same ``evalscope[aigc]`` extra.
    for required_module in ('torch', 'torchvision', 'diffusers'):
        check_import(required_module, package='evalscope[aigc]', raise_error=True)

    from .image_edit_model import ImageEditAPI as api_class

    return api_class