evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
--- evalscope/models/adapters/chat_adapter.py
+++ /dev/null
@@ -1,207 +0,0 @@
-import os
-import time
-import torch
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from evalscope.constants import OutputType
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import fix_do_sample_warning
-from ..local_model import LocalModel
-from ..register import register_model_adapter
-from .base_adapter import BaseModelAdapter
-
-logger = get_logger()
-
-
-@register_model_adapter(name=OutputType.GENERATION)
-class ChatGenerationModelAdapter(BaseModelAdapter):
-    """
-    Chat generation model adapter.
-    """
-
-    def __init__(self, model: LocalModel, **kwargs):
-        super().__init__(model)
-
-        self.generation_config = self._parse_generation_config(self.tokenizer, self.model)
-
-        custom_generation_config = kwargs.pop('generation_config', None)
-        custom_chat_template = kwargs.pop('chat_template', None)
-
-        if custom_generation_config:
-            logger.info('Updating generation config ...')
-            self.generation_config.update(**custom_generation_config)
-
-        if custom_chat_template:
-            self.tokenizer.chat_template = custom_chat_template
-            logger.info(f'Using custom chat template: {custom_chat_template}')
-
-    def _parse_generation_config(self, tokenizer, model):
-        from modelscope import GenerationConfig
-
-        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
-
-        try:
-            remote_config = GenerationConfig.from_pretrained(
-                self.model_id, revision=self.model_revision, trust_remote_code=True)
-            generation_config.update(**remote_config.to_dict())
-        except Exception:
-            logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
-
-        if isinstance(self.model_id, str) and os.path.exists(self.model_id):
-            logger.warning(f'Got local model dir: {self.model_id}')
-
-        if tokenizer.eos_token_id is not None:
-            generation_config.eos_token_id = tokenizer.eos_token_id
-        if tokenizer.pad_token_id is not None:
-            generation_config.pad_token_id = tokenizer.pad_token_id
-        if generation_config.max_new_tokens is None:
-            generation_config.max_new_tokens = 2048
-
-        return generation_config
-
-    def _model_generate(self,
-                        formatted_prompts: List[str],
-                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
-        """
-        Args:
-            formatted_prompts: The formatted prompts.
-            infer_cfg: The inference configuration.
-        Returns:
-            The prediction results.
-        """
-        if infer_cfg is None:
-            infer_cfg = {}
-
-        # Process infer_cfg
-        num_return_sequences = infer_cfg.get('num_return_sequences', 1)
-        if num_return_sequences > 1:
-            infer_cfg['do_sample'] = True
-
-        # stop settings
-        stop = infer_cfg.get('stop', [])
-        if stop:
-            eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0]
-        else:
-            eos_token_id = self.tokenizer.eos_token_id
-
-        if eos_token_id is not None:
-            infer_cfg['eos_token_id'] = eos_token_id
-
-        self.generation_config.update(**infer_cfg)
-        fix_do_sample_warning(self.generation_config)
-
-        # Get input ids
-        inputs = self.tokenizer(
-            formatted_prompts, return_tensors='pt', padding=True, truncation=True,
-            padding_side='left').to(self.model.device)  # padding_side='left' is important for chat model
-        input_ids = inputs['input_ids']
-
-        # Run inference
-        output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
-
-        # Decode output
-        responses = []
-        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
-        for i in range(0, len(output_ids), num_return_sequences):
-            query_responses = []
-            for j in range(num_return_sequences):
-                output = output_ids[i + j]
-                response = self.tokenizer.decode(
-                    output[len(input_ids[i // num_return_sequences]):], skip_special_tokens=True)
-                query_responses.append(response)
-            responses.append(query_responses)
-
-        return responses, input_lengths
-
-    def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
-        """
-        Prepare the inputs for the model.
-        Args:
-            inputs: The input data.
-            infer_cfg: The inference configuration.
-        Returns:
-            The prepared inputs and system prompts.
-        """
-        queries = []
-        system_prompts = []
-        message_list = []
-
-        for input_item in inputs:
-            queries.append(input_item['data'][0])
-            system_prompts.append(input_item.get('system_prompt', None))
-            if input_item.get('messages', None):
-                message_list.append(input_item.get('messages', None))
-
-        # For non chat model, use the original queries as the input
-        if self.tokenizer.chat_template is None:
-            return queries
-
-        # For chat model, use the messages as the input
-        # if message_list is None, use the queries as the input
-        if len(message_list) == 0:
-            for i, query in enumerate(queries):
-                messages = [ChatMessage(role='user', content=query)]
-                if i < len(system_prompts) and system_prompts[i]:
-                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
-                message_list.append(messages)
-
-        # Format the messages
-        formatted_prompts = []
-        for messages in message_list:
-            # apply chat template
-            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
-            if chat_template_kwargs is not None:
-                prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
-            else:
-                prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            formatted_prompts.append(prompts)
-
-        logger.debug(f'formatted_prompts: {formatted_prompts}')
-        return formatted_prompts
-
-    @torch.no_grad()
-    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
-        """
-        Args:
-            inputs: The input data.
-            infer_cfg: The inference configuration.
-        Returns:
-            The prediction results.
-        """
-
-        # Process inputs
-        formatted_prompts = self._prepare_inputs(inputs, infer_cfg)
-
-        # Run inference
-        responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)
-
-        # Process outputs
-        results = []
-        for response, input_length in zip(responses, input_lengths):
-            choices_list = []
-            completion_tokens = 0
-
-            for index, one_response in enumerate(response):
-                choice = ChatCompletionResponseChoice(
-                    index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-                choices_list.append(choice)
-
-                completion_tokens += len(self.tokenizer.encode(one_response))
-
-            usage = Usage(
-                prompt_tokens=input_length,
-                completion_tokens=completion_tokens,
-                total_tokens=input_length + completion_tokens)
-
-            res_d = ChatCompletionResponse(
-                model=self.model_id,
-                choices=choices_list,
-                object='chat.completion',
-                created=int(time.time()),
-                usage=usage).model_dump(exclude_unset=True)
-
-            results.append(res_d)
-
-        return results
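
For reference, a minimal sketch of how the removed ChatGenerationModelAdapter was driven in 0.17.x. The input dict format and the chat.completion-shaped result come from the code above; the LocalModel constructor arguments and the model id are illustrative assumptions, not verified API:

```python
# Hypothetical usage of the removed ChatGenerationModelAdapter (evalscope <= 0.17.x).
# Input/output shapes mirror _prepare_inputs() and predict() above.
from evalscope.models.local_model import LocalModel
from evalscope.models.adapters.chat_adapter import ChatGenerationModelAdapter

model = LocalModel(model_id='Qwen/Qwen2.5-0.5B-Instruct')  # assumed constructor signature
adapter = ChatGenerationModelAdapter(model)

# One dict per sample: prompt under 'data', optional 'system_prompt' or 'messages'.
inputs = [{'data': ['What is 2 + 2?'], 'system_prompt': 'You are a helpful assistant.'}]
results = adapter.predict(inputs, infer_cfg={'max_new_tokens': 64})

# Each result is an OpenAI-style chat.completion dict.
print(results[0]['choices'][0]['message']['content'])
```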
--- evalscope/models/adapters/choice_adapter.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import numpy as np
-import time
-import torch
-from typing import List
-
-from evalscope.constants import OutputType
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
-from ..local_model import LocalModel
-from ..register import register_model_adapter
-from .base_adapter import BaseModelAdapter
-
-
-@register_model_adapter(name=OutputType.MULTIPLE_CHOICE)
-class MultiChoiceModelAdapter(BaseModelAdapter):
-    """ The multi-choice model adapter. """
-
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(self, model: LocalModel, **kwargs):
-        super().__init__(model)
-
-        self._max_length = kwargs.get('max_length')
-
-    @property
-    def max_length(self):
-        if self._max_length:
-            return self._max_length
-        seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, 'model_max_length'):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
-
-    @torch.no_grad()
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-
-        Args:
-            inputs (List[dict]): The inputs for a doc. Format:
-                {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
-
-            infer_cfg (dict): inference configuration.
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-        self.model.generation_config.update(**infer_cfg)
-
-        input_data = [inp['data'][0] for inp in inputs]
-        multi_choices = [inp['multi_choices'] for inp in inputs]
-
-        outputs, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-
-        results = []
-        for i, (logits, choices) in enumerate(zip(outputs, multi_choices)):
-            choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in choices]
-            softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
-
-            if softval.dtype in {torch.bfloat16, torch.float16}:
-                softval = softval.to(dtype=torch.float32)
-            probs = softval.detach().cpu().numpy()
-            pred: str = choices[int(np.argmax(probs))]  # Format: A or B or C or D
-
-            res_d = ChatCompletionResponse(
-                model=self.model_id,
-                choices=[
-                    ChatCompletionResponseChoice(
-                        index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
-                ],
-                object='chat.completion',
-                created=int(time.time()),
-                usage=None).model_dump(exclude_unset=True)
-
-            results.append(res_d)
-
-        return results
-
-    @staticmethod
-    def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(
-            inputs, padding=True, return_tensors='pt', padding_side='left')['input_ids'].to(model.device)
-        tokens = {'input_ids': input_ids}
-
-        outputs = model(input_ids)['logits']
-        logits = outputs[:, -1, :]
-        log_probs = torch.nn.functional.softmax(logits, dim=-1)
-        return log_probs, {'tokens': tokens}
-
-
-@register_model_adapter(name=OutputType.CONTINUOUS)
-class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
-    """
-    Continuation-logits model adapter.
-    """
-
-    def __init__(self, model: LocalModel, **kwargs):
-        super().__init__(model, **kwargs)
-
-    @torch.no_grad()
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-        Args:
-            inputs (List[dict]): The inputs for a doc. Format:
-                {'data': [(context, continuation), ...]}
-            infer_cfg (dict): inference configuration.
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-
-        pred_list: list = []
-        for inp in inputs:
-            pred_list.append(self.loglikelihood(inputs=inp['data'], infer_cfg=infer_cfg))
-
-        results = []
-        for pred in pred_list:
-            res_d = ChatCompletionResponse(
-                model=self.model_id,
-                choices=[{
-                    'index': 0,
-                    'message': {
-                        'content': pred,
-                        'role': 'assistant'
-                    }
-                }],
-                object='chat.completion',
-                created=int(time.time()),
-                usage=None).model_dump(exclude_unset=True)
-            results.append(res_d)
-
-        return results
-
-    def loglikelihood(self, inputs: List[tuple], infer_cfg: dict = None) -> list:
-        self.model.generation_config.update(**infer_cfg)
-        # To predict one doc
-        doc_ele_pred = []
-        for ctx, continuation in inputs:
-
-            # ctx_enc shape: [context_tok_len]  cont_enc shape: [continuation_tok_len]
-            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
-
-            inputs_tokens = torch.tensor(
-                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
-                dtype=torch.long,
-                device=self.model.device).unsqueeze(0)
-
-            logits = self.model(inputs_tokens)[0]
-            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-
-            logits = logits[:, -len(cont_enc):, :]
-            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
-            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
-
-            choice_score = float(logits.sum())
-            doc_ele_pred.append(choice_score)
-
-        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
-        return doc_ele_pred
-
-    def _encode_pair(self, context, continuation):
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-
-        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
-        whole_enc = torch.tensor(whole_enc, device=self.device)
-
-        context_enc = self.tokenizer(context, padding=False)['input_ids']
-        context_enc = torch.tensor(context_enc, device=self.device)
-
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-
-        return context_enc, continuation_enc
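
The scoring logic in loglikelihood() above is self-contained enough to restate outside evalscope: sum the log-probabilities of the continuation tokens given the context, then pick the highest-scoring choice. A standalone sketch with plain transformers (the model name is illustrative), mirroring the gather-and-sum over continuation-token log-probs in the deleted code:

```python
# Standalone sketch of continuation scoring as implemented by loglikelihood() above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
model.eval()

def continuation_score(context: str, continuation: str) -> float:
    ctx_ids = tok(context)['input_ids']
    whole_ids = tok(context + continuation)['input_ids']
    cont_ids = whole_ids[len(ctx_ids):]
    # Drop the last token: logits at position t predict token t + 1.
    input_ids = torch.tensor(whole_ids[:-1]).unsqueeze(0)
    with torch.no_grad():
        log_probs = torch.log_softmax(model(input_ids).logits.float(), dim=-1)
    # The last len(cont_ids) positions predict exactly the continuation tokens.
    tail = log_probs[0, -len(cont_ids):, :]
    target = torch.tensor(cont_ids).unsqueeze(-1)
    return float(tail.gather(1, target).sum())

choices = [' Paris', ' London', ' Berlin']
scores = [continuation_score('The capital of France is', c) for c in choices]
print(choices[scores.index(max(scores))])  # ' Paris' should score highest
```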
--- evalscope/models/adapters/custom_adapter.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from typing import TYPE_CHECKING, Any, Dict, List, Union
-
-from ..register import register_model_adapter
-from .base_adapter import BaseModelAdapter
-
-if TYPE_CHECKING:
-    from ..custom import CustomModel
-
-
-@register_model_adapter(name='custom')
-class CustomModelAdapter(BaseModelAdapter):
-
-    def __init__(self, custom_model: 'CustomModel', **kwargs):
-        """
-        Custom model adapter.
-
-        Args:
-            custom_model: The custom model instance.
-            **kwargs: Other args.
-        """
-        self.custom_model = custom_model
-        super(CustomModelAdapter, self).__init__(model=custom_model)
-
-    def predict(self, inputs: List[Union[str, dict, list]], **kwargs) -> List[Dict[str, Any]]:
-        """
-        Model prediction func.
-
-        Args:
-            inputs (List[Union[str, dict, list]]): The input data. Depending on the specific model.
-                str: 'xxx'
-                dict: {'data': [full_prompt]}
-                list: ['xxx', 'yyy', 'zzz']
-            **kwargs: kwargs
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                    'content': 'xxx',
-                    'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              'model': 'gpt-3.5-turbo-0613',  # should be model_id
-              'object': 'chat.completion',
-              'usage': {
-                'completion_tokens': 17,
-                'prompt_tokens': 57,
-                'total_tokens': 74
-              }
-            }
-        """
-        in_prompts = []
-
-        # Note: here we assume the inputs are all prompts for the benchmark.
-        for input_prompt in inputs:
-            if isinstance(input_prompt, str):
-                in_prompts.append(input_prompt)
-            elif isinstance(input_prompt, dict):
-                # TODO: to be supported for continuation list like truthful_qa
-                in_prompts.append(input_prompt['data'][0])
-            elif isinstance(input_prompt, list):
-                in_prompts.append('\n'.join(input_prompt))
-            else:
-                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
-
-        return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
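
The deleted adapter pins down the contract a CustomModel had to satisfy: predict() receives the normalized prompt strings plus the raw inputs and returns chat.completion-shaped dicts. A hedged sketch of an implementation against that contract (the CustomModel base-class import and its config argument are assumptions from 0.17.x, not verified API):

```python
# Hedged sketch of the CustomModel contract implied by the removed adapter.
import time

from evalscope.models.custom import CustomModel  # removed in 1.0.x

class EchoModel(CustomModel):

    def __init__(self):
        super().__init__(config={'model_id': 'echo-model'})  # assumed kwarg

    def predict(self, prompts, origin_inputs=None, **kwargs):
        # One chat.completion dict per prompt, matching the docstring above.
        return [{
            'choices': [{
                'index': 0,
                'message': {'content': f'echo: {p}', 'role': 'assistant'},
            }],
            'created': int(time.time()),
            'model': 'echo-model',
            'object': 'chat.completion',
            'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
        } for p in prompts]
```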