evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py
@@ -0,0 +1,1876 @@
1
+ # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ PyTorch T5 model."""
15
+
16
+ import copy
17
+ import math
18
+ import os
19
+ import torch
20
+ import warnings
21
+ from torch import nn
22
+ from torch.nn import CrossEntropyLoss
23
+ from torch.utils.checkpoint import checkpoint
24
+ from transformers.activations import ACT2FN
25
+ from transformers.modeling_outputs import (
26
+ BaseModelOutput,
27
+ BaseModelOutputWithPastAndCrossAttentions,
28
+ Seq2SeqLMOutput,
29
+ Seq2SeqModelOutput,
30
+ )
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.models.t5.configuration_t5 import T5Config
33
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
34
+ from transformers.utils import (
35
+ DUMMY_INPUTS,
36
+ DUMMY_MASK,
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ is_torch_fx_proxy,
40
+ logging,
41
+ replace_return_docstrings,
42
+ )
43
+ from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
44
+ from typing import Optional, Tuple, Union
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+ _CONFIG_FOR_DOC = 'T5Config'
49
+ _TOKENIZER_FOR_DOC = 'T5Tokenizer'
50
+ _CHECKPOINT_FOR_DOC = 't5-small'
51
+
52
+ ####################################################
53
+ # This dict contains ids and associated url
54
+ # for the pretrained weights provided with the models
55
+ ####################################################
56
+ T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
57
+ 't5-small',
58
+ 't5-base',
59
+ 't5-large',
60
+ 't5-3b',
61
+ 't5-11b',
62
+ # See all T5 models at https://huggingface.co/models?filter=t5
63
+ ]
64
+
65
+
66
+ ####################################################
67
+ # This is a conversion method from TF 1.0 to PyTorch
68
+ # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
69
+ ####################################################
70
+ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
71
+ """Load tf checkpoints in a pytorch model."""
72
+ try:
73
+ import numpy as np
74
+ import re
75
+ import tensorflow as tf
76
+ except ImportError:
77
+ logger.error(
78
+ 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
79
+ 'https://www.tensorflow.org/install/ for installation instructions.'
80
+ )
81
+ raise
82
+ tf_path = os.path.abspath(tf_checkpoint_path)
83
+ logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
84
+ # Load weights from TF model
85
+ init_vars = tf.train.list_variables(tf_path)
86
+ names = []
87
+ tf_weights = {}
88
+ for name, shape in init_vars:
89
+ logger.info(f'Loading TF weight {name} with shape {shape}')
90
+ array = tf.train.load_variable(tf_path, name)
91
+ names.append(name)
92
+ tf_weights[name] = array
93
+
94
+ for txt_name in names:
95
+ name = txt_name.split('/')
96
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
97
+ # which are not required for using pretrained model
98
+ if any(
99
+ n in [
100
+ 'adam_v',
101
+ 'adam_m',
102
+ 'AdamWeightDecayOptimizer',
103
+ 'AdamWeightDecayOptimizer_1',
104
+ 'global_step',
105
+ ] for n in name
106
+ ):
107
+ logger.info(f"Skipping {'/'.join(name)}")
108
+ tf_weights.pop(txt_name, None)
109
+ continue
110
+ if '_slot_' in name[-1]:
111
+ logger.info(f"Skipping {'/'.join(name)}")
112
+ tf_weights.pop(txt_name, None)
113
+ continue
114
+ pointer = model
115
+ array = tf_weights[txt_name]
116
+
117
+ for m_name in name:
118
+ if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
119
+ scope_names = re.split(r'_(\d+)', m_name)
120
+ else:
121
+ scope_names = [m_name]
122
+ if scope_names[0] in ['kernel', 'scale', 'embedding']:
123
+ pointer = getattr(pointer, 'weight')
124
+ elif scope_names[0] == 'self_attention':
125
+ pointer = getattr(pointer, 'layer')
126
+ pointer = pointer[0]
127
+ elif scope_names[0] == 'enc_dec_attention':
128
+ pointer = getattr(pointer, 'layer')
129
+ pointer = pointer[1]
130
+ elif scope_names[0] == 'dense_relu_dense':
131
+ pointer = getattr(pointer, 'layer')
132
+ pointer = pointer[2]
133
+ elif scope_names[0] == 'rms_norm':
134
+ if hasattr(pointer, 'layer_norm'):
135
+ pointer = getattr(pointer, 'layer_norm')
136
+ elif hasattr(pointer, 'final_layer_norm'):
137
+ pointer = getattr(pointer, 'final_layer_norm')
138
+ elif scope_names[0] == 'scale':
139
+ pointer = getattr(pointer, 'weight')
140
+ elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
141
+ pointer = getattr(pointer, 'bias')
142
+ elif scope_names[0] == 'squad':
143
+ pointer = getattr(pointer, 'classifier')
144
+ elif scope_names[0] == 'decoder' and name[1] == 'logits':
145
+ continue
146
+ elif scope_names[0] == 'logits':
147
+ pointer = getattr(pointer, 'lm_head')
148
+ elif (scope_names[0] == 'wi' and len(scope_names) > 1 and scope_names[1].isdigit()):
149
+ pointer = getattr(pointer, f'wi_{scope_names[1]}')
150
+ continue
151
+ else:
152
+ try:
153
+ pointer = getattr(pointer, scope_names[0])
154
+ except AttributeError:
155
+ logger.info(f"Skipping {'/'.join(name)}")
156
+ continue
157
+ if len(scope_names) >= 2:
158
+ num = int(scope_names[1])
159
+ pointer = pointer[num]
160
+ if scope_names[0] not in ['kernel', 'scale', 'embedding']:
161
+ pointer = getattr(pointer, 'weight')
162
+ if scope_names[0] != 'embedding':
163
+ logger.info(f'Transposing numpy weight of shape {array.shape} for {name}')
164
+ array = np.transpose(array)
165
+ try:
166
+ assert (
167
+ pointer.shape == array.shape
168
+ ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
169
+ except AssertionError as e:
170
+ e.args += (pointer.shape, array.shape)
171
+ raise
172
+ logger.info(f'Initialize PyTorch weight {name}')
173
+ pointer.data = torch.from_numpy(array.astype(np.float32))
174
+ tf_weights.pop(txt_name, None)
175
+
176
+ logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
177
+ return model
178
+
179
+
180
+ ####################################################
181
+ # PyTorch Models are constructed by sub-classing
182
+ # - torch.nn.Module for the layers and
183
+ # - PreTrainedModel for the models (it-self a sub-class of nn.Module)
184
+ ####################################################
185
+ PARALLELIZE_DOCSTRING = r"""
186
+ This is an experimental feature and is a subject to change at a moment's notice.
187
+
188
+ Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
189
+ it will evenly distribute blocks across all devices.
190
+
191
+ Args:
192
+ device_map (`Dict[int, list]`, optional, defaults to None):
193
+ A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
194
+ automatically mapped to the first device (for esoteric reasons). That means that the first device should
195
+ have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
196
+ following number of attention modules:
197
+
198
+ - t5-small: 6
199
+ - t5-base: 12
200
+ - t5-large: 24
201
+ - t5-3b: 24
202
+ - t5-11b: 24
203
+
204
+ Example:
205
+
206
+ ```python
207
+ # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
208
+ model = T5ForConditionalGeneration.from_pretrained("t5-3b")
209
+ device_map = {
210
+ 0: [0, 1, 2],
211
+ 1: [3, 4, 5, 6, 7, 8, 9],
212
+ 2: [10, 11, 12, 13, 14, 15, 16],
213
+ 3: [17, 18, 19, 20, 21, 22, 23],
214
+ }
215
+ model.parallelize(device_map)
216
+ ```
217
+ """
218
+ DEPARALLELIZE_DOCSTRING = r"""
219
+ Moves the model to cpu from a model parallel state.
220
+
221
+ Example:
222
+
223
+ ```python
224
+ # On a 4 GPU machine with t5-3b:
225
+ model = T5ForConditionalGeneration.from_pretrained("t5-3b")
226
+ device_map = {
227
+ 0: [0, 1, 2],
228
+ 1: [3, 4, 5, 6, 7, 8, 9],
229
+ 2: [10, 11, 12, 13, 14, 15, 16],
230
+ 3: [17, 18, 19, 20, 21, 22, 23],
231
+ }
232
+ model.parallelize(device_map) # Splits the model across several devices
233
+ model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
234
+ ```
235
+ """
236
+
237
+
238
+ class T5LayerNorm(nn.Module):
239
+
240
+ def __init__(self, hidden_size, eps=1e-6):
241
+ """
242
+ Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
243
+ """
244
+ super().__init__()
245
+ self.weight = nn.Parameter(torch.ones(hidden_size))
246
+ self.variance_epsilon = eps
247
+
248
+ def forward(self, hidden_states):
249
+
250
+ # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
251
+ # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated
252
+ # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
253
+ # half-precision inputs is done in fp32
254
+
255
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
256
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
257
+
258
+ # convert into half-precision if necessary
259
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
260
+ hidden_states = hidden_states.to(self.weight.dtype)
261
+
262
+ return self.weight * hidden_states
263
+
264
+
265
+ try:
266
+ from apex.normalization import FusedRMSNorm
267
+
268
+ T5LayerNorm = FusedRMSNorm # noqa
269
+
270
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm')
271
+ except ImportError:
272
+ # using the normal T5LayerNorm
273
+ pass
274
+ except Exception:
275
+ logger.warning('discovered apex but it failed to load, falling back to T5LayerNorm')
276
+ pass
277
+
278
+ ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
279
+
280
+
281
+ class T5DenseActDense(nn.Module):
282
+
283
+ def __init__(self, config: T5Config):
284
+ super().__init__()
285
+ self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
286
+ self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
287
+ self.dropout = nn.Dropout(config.dropout_rate)
288
+ self.act = ACT2FN[config.dense_act_fn]
289
+
290
+ def forward(self, hidden_states):
291
+ hidden_states = self.wi(hidden_states)
292
+ hidden_states = self.act(hidden_states)
293
+ hidden_states = self.dropout(hidden_states)
294
+ hidden_states = self.wo(hidden_states)
295
+ return hidden_states
296
+
297
+
298
+ class T5DenseGatedActDense(nn.Module):
299
+
300
+ def __init__(self, config: T5Config):
301
+ super().__init__()
302
+ self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
303
+ self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
304
+ self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
305
+ self.dropout = nn.Dropout(config.dropout_rate)
306
+ self.act = ACT2FN[config.dense_act_fn]
307
+
308
+ def forward(self, hidden_states):
309
+ hidden_gelu = self.act(self.wi_0(hidden_states))
310
+ hidden_linear = self.wi_1(hidden_states)
311
+ hidden_states = hidden_gelu * hidden_linear
312
+ hidden_states = self.dropout(hidden_states)
313
+ hidden_states = self.wo(hidden_states)
314
+ return hidden_states
315
+
316
+
317
+ class T5LayerFF(nn.Module):
318
+
319
+ def __init__(self, config: T5Config):
320
+ super().__init__()
321
+ if config.is_gated_act:
322
+ self.DenseReluDense = T5DenseGatedActDense(config)
323
+ else:
324
+ self.DenseReluDense = T5DenseActDense(config)
325
+
326
+ self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
327
+ self.dropout = nn.Dropout(config.dropout_rate)
328
+
329
+ def forward(self, hidden_states):
330
+ forwarded_states = self.layer_norm(hidden_states)
331
+ forwarded_states = self.DenseReluDense(forwarded_states)
332
+ hidden_states = hidden_states + self.dropout(forwarded_states)
333
+ return hidden_states
334
+
335
+
336
+ class T5Attention(nn.Module):
337
+
338
+ def __init__(self, config: T5Config, has_relative_attention_bias=False):
339
+ super().__init__()
340
+ self.is_decoder = config.is_decoder
341
+ self.has_relative_attention_bias = has_relative_attention_bias
342
+ self.relative_attention_num_buckets = config.relative_attention_num_buckets
343
+ self.relative_attention_max_distance = config.relative_attention_max_distance
344
+ self.d_model = config.d_model
345
+ self.key_value_proj_dim = config.d_kv
346
+ self.n_heads = config.num_heads
347
+ self.dropout = config.dropout_rate
348
+ self.inner_dim = self.n_heads * self.key_value_proj_dim
349
+
350
+ # Mesh TensorFlow initialization to avoid scaling before softmax
351
+ self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
352
+ self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
353
+ self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
354
+ self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
355
+
356
+ if self.has_relative_attention_bias:
357
+ self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
358
+ self.pruned_heads = set()
359
+ self.gradient_checkpointing = False
360
+
361
+ def prune_heads(self, heads):
362
+ if len(heads) == 0:
363
+ return
364
+ heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads)
365
+ # Prune linear layers
366
+ self.q = prune_linear_layer(self.q, index)
367
+ self.k = prune_linear_layer(self.k, index)
368
+ self.v = prune_linear_layer(self.v, index)
369
+ self.o = prune_linear_layer(self.o, index, dim=1)
370
+ # Update hyper params
371
+ self.n_heads = self.n_heads - len(heads)
372
+ self.inner_dim = self.key_value_proj_dim * self.n_heads
373
+ self.pruned_heads = self.pruned_heads.union(heads)
374
+
375
+ @staticmethod
376
+ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
377
+ """
378
+ Adapted from Mesh Tensorflow:
379
+ https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
380
+
381
+ Translate relative position to a bucket number for relative attention. The relative position is defined as
382
+ memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
383
+ position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
384
+ small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
385
+ positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
386
+ This should allow for more graceful generalization to longer sequences than the model has been trained on
387
+
388
+ Args:
389
+ relative_position: an int32 Tensor
390
+ bidirectional: a boolean - whether the attention is bidirectional
391
+ num_buckets: an integer
392
+ max_distance: an integer
393
+
394
+ Returns:
395
+ a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
396
+ """
397
+ relative_buckets = 0
398
+ if bidirectional:
399
+ num_buckets //= 2
400
+ relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
401
+ relative_position = torch.abs(relative_position)
402
+ else:
403
+ relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
404
+ # now relative_position is in the range [0, inf)
405
+
406
+ # half of the buckets are for exact increments in positions
407
+ max_exact = num_buckets // 2
408
+ is_small = relative_position < max_exact
409
+
410
+ # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
411
+ relative_position_if_large = max_exact + (
412
+ torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) *
413
+ (num_buckets - max_exact)
414
+ ).to(torch.long)
415
+ relative_position_if_large = torch.min(
416
+ relative_position_if_large,
417
+ torch.full_like(relative_position_if_large, num_buckets - 1),
418
+ )
419
+
420
+ relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
421
+ return relative_buckets
422
+
423
+ def compute_bias(self, query_length, key_length, device=None):
424
+ """Compute binned relative position bias"""
425
+ if device is None:
426
+ device = self.relative_attention_bias.weight.device
427
+ context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
428
+ memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
429
+ relative_position = (memory_position - context_position) # shape (query_length, key_length)
430
+ relative_position_bucket = self._relative_position_bucket(
431
+ relative_position, # shape (query_length, key_length)
432
+ bidirectional=(not self.is_decoder),
433
+ num_buckets=self.relative_attention_num_buckets,
434
+ max_distance=self.relative_attention_max_distance,
435
+ )
436
+ values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
437
+ values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
438
+ return values
439
+
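`compute_bias` returns an additive bias of shape `(1, n_heads, query_length, key_length)` that is simply added to the raw attention scores (with the extended attention mask folded in the same way) before the softmax. A shape-only sketch with illustrative sizes:

```python
# Shape-only sketch of how the relative-position bias broadcasts over the scores.
import torch

batch, n_heads, q_len, k_len = 2, 8, 5, 5
scores = torch.randn(batch, n_heads, q_len, k_len)      # query @ key^T
position_bias = torch.randn(1, n_heads, q_len, k_len)   # as returned by compute_bias
mask = torch.zeros(batch, 1, 1, k_len)                  # extended mask: 0 = keep, large negative = drop
attn = torch.softmax(scores + position_bias + mask, dim=-1)
print(attn.shape)                                       # (2, 8, 5, 5)
```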
440
+ def forward(
441
+ self,
442
+ hidden_states,
443
+ mask=None,
444
+ key_value_states=None,
445
+ position_bias=None,
446
+ past_key_value=None,
447
+ layer_head_mask=None,
448
+ query_length=None,
449
+ use_cache=False,
450
+ output_attentions=False,
451
+ ):
452
+ """
453
+ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
454
+ """
455
+ # Input is (batch_size, seq_length, dim)
456
+ # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
457
+ # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
458
+ batch_size, seq_length = hidden_states.shape[:2]
459
+
460
+ real_seq_length = seq_length
461
+
462
+ if past_key_value is not None:
463
+ assert (
464
+ len(past_key_value) == 2
465
+ ), f'past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states'
466
+ real_seq_length += (past_key_value[0].shape[2] if query_length is None else query_length)
467
+
468
+ key_length = (real_seq_length if key_value_states is None else key_value_states.shape[1])
469
+
470
+ def shape(states):
471
+ """projection"""
472
+ return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
473
+
474
+ def unshape(states):
475
+ """reshape"""
476
+ return (states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim))
477
+
478
+ def project(hidden_states, proj_layer, key_value_states, past_key_value):
479
+ """projects hidden states correctly to key/query states"""
480
+ if key_value_states is None:
481
+ # self-attn
482
+ # (batch_size, n_heads, seq_length, dim_per_head)
483
+ hidden_states = shape(proj_layer(hidden_states))
484
+ elif past_key_value is None:
485
+ # cross-attn
486
+ # (batch_size, n_heads, seq_length, dim_per_head)
487
+ hidden_states = shape(proj_layer(key_value_states))
488
+
489
+ if past_key_value is not None:
490
+ if key_value_states is None:
491
+ # self-attn
492
+ # (batch_size, n_heads, key_length, dim_per_head)
493
+ hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
494
+ else:
495
+ # cross-attn
496
+ hidden_states = past_key_value
497
+ return hidden_states
498
+
499
+ # get query states
500
+ query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head)
501
+
502
+ # get key/value states
503
+ key_states = project(
504
+ hidden_states,
505
+ self.k,
506
+ key_value_states,
507
+ past_key_value[0] if past_key_value is not None else None,
508
+ )
509
+ value_states = project(
510
+ hidden_states,
511
+ self.v,
512
+ key_value_states,
513
+ past_key_value[1] if past_key_value is not None else None,
514
+ )
515
+
516
+ # compute scores
517
+ scores = torch.matmul(
518
+ query_states, key_states.transpose(3, 2)
519
+ ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
520
+
521
+ if position_bias is None:
522
+ if not self.has_relative_attention_bias:
523
+ position_bias = torch.zeros(
524
+ (1, self.n_heads, real_seq_length, key_length),
525
+ device=scores.device,
526
+ dtype=scores.dtype,
527
+ )
528
+ if self.gradient_checkpointing and self.training:
529
+ position_bias.requires_grad = True
530
+ else:
531
+ position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
532
+
533
+ # if key and values are already calculated
534
+ # we want only the last query position bias
535
+ if past_key_value is not None:
536
+ position_bias = position_bias[:, :, -hidden_states.size(1):, :]
537
+
538
+ if mask is not None:
539
+ position_bias = (position_bias + mask) # (batch_size, n_heads, seq_length, key_length)
540
+
541
+ if self.pruned_heads:
542
+ mask = torch.ones(position_bias.shape[1])
543
+ mask[list(self.pruned_heads)] = 0
544
+ position_bias_masked = position_bias[:, mask.bool()]
545
+ else:
546
+ position_bias_masked = position_bias
547
+
548
+ scores += position_bias_masked
549
+ attn_weights = nn.functional.softmax(scores.float(),
550
+ dim=-1).type_as(scores) # (batch_size, n_heads, seq_length, key_length)
551
+ attn_weights = nn.functional.dropout(
552
+ attn_weights, p=self.dropout, training=self.training
553
+ ) # (batch_size, n_heads, seq_length, key_length)
554
+
555
+ # Mask heads if we want to
556
+ if layer_head_mask is not None:
557
+ attn_weights = attn_weights * layer_head_mask
558
+
559
+ attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim)
560
+ attn_output = self.o(attn_output)
561
+
562
+ present_key_value_state = ((key_states, value_states) if (self.is_decoder and use_cache) else None)
563
+ outputs = (attn_output, ) + (present_key_value_state, ) + (position_bias, )
564
+
565
+ if output_attentions:
566
+ outputs = outputs + (attn_weights, )
567
+ return outputs
568
+
569
+
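The `shape()`/`unshape()` helpers in `forward` round-trip between `(batch, seq, inner_dim)` and the per-head layout `(batch, n_heads, seq, d_head)`. A standalone sketch with illustrative dimensions:

```python
# Round-trip between the flat and per-head tensor layouts used by the attention above.
import torch

batch, seq, n_heads, d_head = 2, 5, 8, 64
inner_dim = n_heads * d_head

x = torch.randn(batch, seq, inner_dim)
per_head = x.view(batch, -1, n_heads, d_head).transpose(1, 2)                # shape(): (2, 8, 5, 64)
restored = per_head.transpose(1, 2).contiguous().view(batch, -1, inner_dim)  # unshape(): (2, 5, 512)
assert torch.equal(x, restored)
```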
570
+ class T5LayerSelfAttention(nn.Module):
571
+
572
+ def __init__(self, config, has_relative_attention_bias=False):
573
+ super().__init__()
574
+ self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
575
+ self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
576
+ self.dropout = nn.Dropout(config.dropout_rate)
577
+
578
+ def forward(
579
+ self,
580
+ hidden_states,
581
+ attention_mask=None,
582
+ position_bias=None,
583
+ layer_head_mask=None,
584
+ past_key_value=None,
585
+ use_cache=False,
586
+ output_attentions=False,
587
+ ):
588
+ normed_hidden_states = self.layer_norm(hidden_states)
589
+ attention_output = self.SelfAttention(
590
+ normed_hidden_states,
591
+ mask=attention_mask,
592
+ position_bias=position_bias,
593
+ layer_head_mask=layer_head_mask,
594
+ past_key_value=past_key_value,
595
+ use_cache=use_cache,
596
+ output_attentions=output_attentions,
597
+ )
598
+ hidden_states = hidden_states + self.dropout(attention_output[0])
599
+ outputs = (hidden_states, ) + attention_output[1:] # add attentions if we output them
600
+ return outputs
601
+
602
+
603
+ class T5LayerCrossAttention(nn.Module):
604
+
605
+ def __init__(self, config):
606
+ super().__init__()
607
+ self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
608
+ self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
609
+ self.dropout = nn.Dropout(config.dropout_rate)
610
+
611
+ def forward(
612
+ self,
613
+ hidden_states,
614
+ key_value_states,
615
+ attention_mask=None,
616
+ position_bias=None,
617
+ layer_head_mask=None,
618
+ past_key_value=None,
619
+ use_cache=False,
620
+ query_length=None,
621
+ output_attentions=False,
622
+ ):
623
+ normed_hidden_states = self.layer_norm(hidden_states)
624
+ attention_output = self.EncDecAttention(
625
+ normed_hidden_states,
626
+ mask=attention_mask,
627
+ key_value_states=key_value_states,
628
+ position_bias=position_bias,
629
+ layer_head_mask=layer_head_mask,
630
+ past_key_value=past_key_value,
631
+ use_cache=use_cache,
632
+ query_length=query_length,
633
+ output_attentions=output_attentions,
634
+ )
635
+ layer_output = hidden_states + self.dropout(attention_output[0])
636
+ outputs = (layer_output, ) + attention_output[1:] # add attentions if we output them
637
+ return outputs
638
+
639
+
640
+ class T5Block(nn.Module):
641
+
642
+ def __init__(self, config, has_relative_attention_bias=False):
643
+ super().__init__()
644
+ self.is_decoder = config.is_decoder
645
+ self.layer = nn.ModuleList()
646
+ self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
647
+ if self.is_decoder:
648
+ self.layer.append(T5LayerCrossAttention(config))
649
+
650
+ self.layer.append(T5LayerFF(config))
651
+
652
+ def forward(
653
+ self,
654
+ hidden_states,
655
+ attention_mask=None,
656
+ position_bias=None,
657
+ encoder_hidden_states=None,
658
+ encoder_attention_mask=None,
659
+ encoder_decoder_position_bias=None,
660
+ layer_head_mask=None,
661
+ cross_attn_layer_head_mask=None,
662
+ past_key_value=None,
663
+ use_cache=False,
664
+ output_attentions=False,
665
+ return_dict=True,
666
+ ):
667
+
668
+ if past_key_value is not None:
669
+ if not self.is_decoder:
670
+ logger.warning('`past_key_values` is passed to the encoder. Please make sure this is intended.')
671
+ expected_num_past_key_values = 2 if encoder_hidden_states is None else 4
672
+
673
+ if len(past_key_value) != expected_num_past_key_values:
674
+ raise ValueError(
675
+ f'There should be {expected_num_past_key_values} past states. '
676
+ f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
677
+ f'Got {len(past_key_value)} past key / value states'
678
+ )
679
+
680
+ self_attn_past_key_value = past_key_value[:2]
681
+ cross_attn_past_key_value = past_key_value[2:]
682
+ else:
683
+ self_attn_past_key_value, cross_attn_past_key_value = None, None
684
+
685
+ self_attention_outputs = self.layer[0](
686
+ hidden_states,
687
+ attention_mask=attention_mask,
688
+ position_bias=position_bias,
689
+ layer_head_mask=layer_head_mask,
690
+ past_key_value=self_attn_past_key_value,
691
+ use_cache=use_cache,
692
+ output_attentions=output_attentions,
693
+ )
694
+ hidden_states, present_key_value_state = self_attention_outputs[:2]
695
+ attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights
696
+
697
+ # clamp inf values to enable fp16 training
698
+ if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
699
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
700
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
701
+
702
+ do_cross_attention = self.is_decoder and encoder_hidden_states is not None
703
+ if do_cross_attention:
704
+ # the actual query length is unknown for cross attention
705
+ # if using past key value states. Need to inject it here
706
+ if present_key_value_state is not None:
707
+ query_length = present_key_value_state[0].shape[2]
708
+ else:
709
+ query_length = None
710
+
711
+ cross_attention_outputs = self.layer[1](
712
+ hidden_states,
713
+ key_value_states=encoder_hidden_states,
714
+ attention_mask=encoder_attention_mask,
715
+ position_bias=encoder_decoder_position_bias,
716
+ layer_head_mask=cross_attn_layer_head_mask,
717
+ past_key_value=cross_attn_past_key_value,
718
+ query_length=query_length,
719
+ use_cache=use_cache,
720
+ output_attentions=output_attentions,
721
+ )
722
+ hidden_states = cross_attention_outputs[0]
723
+
724
+ # clamp inf values to enable fp16 training
725
+ if (hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any()):
726
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
727
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
728
+
729
+ # Combine self attn and cross attn key value states
730
+ if present_key_value_state is not None:
731
+ present_key_value_state = (present_key_value_state + cross_attention_outputs[1])
732
+
733
+ # Keep cross-attention outputs and relative position weights
734
+ attention_outputs = attention_outputs + cross_attention_outputs[2:]
735
+
736
+ # Apply Feed Forward layer
737
+ hidden_states = self.layer[-1](hidden_states)
738
+
739
+ # clamp inf values to enable fp16 training
740
+ if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
741
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
742
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
743
+
744
+ outputs = (hidden_states, )
745
+
746
+ if use_cache:
747
+ outputs = outputs + (present_key_value_state, ) + attention_outputs
748
+ else:
749
+ outputs = outputs + attention_outputs
750
+
751
+ return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
752
+
753
+
754
+ class T5PreTrainedModel(PreTrainedModel):
755
+ """
756
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
757
+ models.
758
+ """
759
+
760
+ config_class = T5Config
761
+ load_tf_weights = load_tf_weights_in_t5
762
+ base_model_prefix = 'transformer'
763
+ is_parallelizable = True
764
+ supports_gradient_checkpointing = True
765
+ _no_split_modules = ['T5Block']
766
+
767
+ @property
768
+ def dummy_inputs(self):
769
+ input_ids = torch.tensor(DUMMY_INPUTS)
770
+ input_mask = torch.tensor(DUMMY_MASK)
771
+ dummy_inputs = {
772
+ 'decoder_input_ids': input_ids,
773
+ 'input_ids': input_ids,
774
+ 'decoder_attention_mask': input_mask,
775
+ }
776
+ return dummy_inputs
777
+
778
+ def _init_weights(self, module):
779
+ """Initialize the weights"""
780
+ factor = (self.config.initializer_factor) # Used for testing weights initialization
781
+ if isinstance(module, T5LayerNorm):
782
+ module.weight.data.fill_(factor * 1.0)
783
+ elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
784
+ # Mesh TensorFlow embeddings initialization
785
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
786
+ module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
787
+ if hasattr(module, 'lm_head') and not self.config.tie_word_embeddings:
788
+ module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
789
+ elif isinstance(module, T5DenseActDense):
790
+ # Mesh TensorFlow FF initialization
791
+ # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
792
+ # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
793
+ module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
794
+ if hasattr(module.wi, 'bias') and module.wi.bias is not None:
795
+ module.wi.bias.data.zero_()
796
+ module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
797
+ if hasattr(module.wo, 'bias') and module.wo.bias is not None:
798
+ module.wo.bias.data.zero_()
799
+ elif isinstance(module, T5DenseGatedActDense):
800
+ module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
801
+ if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None:
802
+ module.wi_0.bias.data.zero_()
803
+ module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
804
+ if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None:
805
+ module.wi_1.bias.data.zero_()
806
+ module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
807
+ if hasattr(module.wo, 'bias') and module.wo.bias is not None:
808
+ module.wo.bias.data.zero_()
809
+ elif isinstance(module, T5Attention):
810
+ # Mesh TensorFlow attention initialization to avoid scaling before softmax
811
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
812
+ d_model = self.config.d_model
813
+ key_value_proj_dim = self.config.d_kv
814
+ n_heads = self.config.num_heads
815
+ module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5))
816
+ module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
817
+ module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
818
+ module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5))
819
+ if module.has_relative_attention_bias:
820
+ module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model)**-0.5))
821
+
822
+ def _set_gradient_checkpointing(self, module, value=False):
823
+ if isinstance(module, (T5Attention, T5Stack)):
824
+ module.gradient_checkpointing = value
825
+
826
+ def _shift_right(self, input_ids):
827
+ decoder_start_token_id = self.config.decoder_start_token_id
828
+ pad_token_id = self.config.pad_token_id
829
+
830
+ assert decoder_start_token_id is not None, (
831
+ 'self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id.'
832
+ ' See T5 docs for more information'
833
+ )
834
+
835
+ # shift inputs to the right
836
+ if is_torch_fx_proxy(input_ids):
837
+ # Item assignment is not supported natively for proxies.
838
+ shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ), decoder_start_token_id)
839
+ shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
840
+ else:
841
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
842
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
843
+ shifted_input_ids[..., 0] = decoder_start_token_id
844
+
845
+ assert (pad_token_id is not None), 'self.model.config.pad_token_id has to be defined.'
846
+ # replace possible -100 values in labels by `pad_token_id`
847
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
848
+
849
+ return shifted_input_ids
850
+
851
+
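`_shift_right` prepends the decoder start token, drops the last label, and replaces any `-100` ignore markers with the pad id. A minimal standalone sketch of the same transform (the token ids and the start/pad value of 0, which matches T5's usual `pad_token_id`/`decoder_start_token_id`, are illustrative):

```python
# Standalone version of the _shift_right transform with toy inputs.
import torch

labels = torch.tensor([[37, 423, 1, -100]])      # -100 marks ignored label positions
decoder_start_token_id = 0
pad_token_id = 0

shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()      # drop the last label, shift right
shifted[..., 0] = decoder_start_token_id         # prepend the start token
shifted.masked_fill_(shifted == -100, pad_token_id)
print(shifted)                                   # tensor([[  0,  37, 423,   1]])
```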
852
+ class T5Stack(T5PreTrainedModel):
853
+
854
+ def __init__(self, config, embed_tokens=None):
855
+ super().__init__(config)
856
+
857
+ self.embed_tokens = embed_tokens
858
+ self.is_decoder = config.is_decoder
859
+
860
+ self.block = nn.ModuleList([
861
+ T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)
862
+ ])
863
+ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
864
+ self.dropout = nn.Dropout(config.dropout_rate)
865
+
866
+ # Initialize weights and apply final processing
867
+ self.post_init()
868
+ # Model parallel
869
+ self.model_parallel = False
870
+ self.device_map = None
871
+ self.gradient_checkpointing = False
872
+
873
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
874
+ def parallelize(self, device_map=None):
875
+ # Check validity of device_map
876
+ self.device_map = (
877
+ get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
878
+ )
879
+ assert_device_map(self.device_map, len(self.block))
880
+ self.model_parallel = True
881
+ self.first_device = ('cpu' if 'cpu' in self.device_map.keys() else 'cuda:' + str(min(self.device_map.keys())))
882
+ self.last_device = 'cuda:' + str(max(self.device_map.keys()))
883
+ # Load onto devices
884
+ for k, v in self.device_map.items():
885
+ for layer in v:
886
+ cuda_device = 'cuda:' + str(k)
887
+ self.block[layer] = self.block[layer].to(cuda_device)
888
+
889
+ # Set embed_tokens to first layer
890
+ self.embed_tokens = self.embed_tokens.to(self.first_device)
891
+ # Set final layer norm to last device
892
+ self.final_layer_norm = self.final_layer_norm.to(self.last_device)
893
+
894
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
895
+ def deparallelize(self):
896
+ self.model_parallel = False
897
+ self.device_map = None
898
+ self.first_device = 'cpu'
899
+ self.last_device = 'cpu'
900
+ for i in range(len(self.block)):
901
+ self.block[i] = self.block[i].to('cpu')
902
+ self.embed_tokens = self.embed_tokens.to('cpu')
903
+ self.final_layer_norm = self.final_layer_norm.to('cpu')
904
+ torch.cuda.empty_cache()
905
+
906
+ def get_input_embeddings(self):
907
+ return self.embed_tokens
908
+
909
+ def set_input_embeddings(self, new_embeddings):
910
+ self.embed_tokens = new_embeddings
911
+
912
+ def forward(
913
+ self,
914
+ input_ids=None,
915
+ attention_mask=None,
916
+ encoder_hidden_states=None,
917
+ encoder_attention_mask=None,
918
+ inputs_embeds=None,
919
+ head_mask=None,
920
+ cross_attn_head_mask=None,
921
+ past_key_values=None,
922
+ use_cache=None,
923
+ output_attentions=None,
924
+ output_hidden_states=None,
925
+ return_dict=None,
926
+ ):
927
+ # Model parallel
928
+ if self.model_parallel:
929
+ torch.cuda.set_device(self.first_device)
930
+ self.embed_tokens = self.embed_tokens.to(self.first_device)
931
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
932
+ output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
933
+ output_hidden_states = (
934
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
935
+ )
936
+ return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
937
+
938
+ if input_ids is not None and inputs_embeds is not None:
939
+ err_msg_prefix = 'decoder_' if self.is_decoder else ''
940
+ raise ValueError(
941
+ f'You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time'
942
+ )
943
+ elif input_ids is not None:
944
+ input_shape = input_ids.size()
945
+ input_ids = input_ids.view(-1, input_shape[-1])
946
+ elif inputs_embeds is not None:
947
+ input_shape = inputs_embeds.size()[:-1]
948
+ else:
949
+ err_msg_prefix = 'decoder_' if self.is_decoder else ''
950
+ raise ValueError(f'You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds')
951
+
952
+ if inputs_embeds is None:
953
+ assert (self.embed_tokens is not None), 'You have to initialize the model with valid token embeddings'
954
+ inputs_embeds = self.embed_tokens(input_ids)
955
+
956
+ batch_size, seq_length = input_shape
957
+
958
+ # required mask seq length can be calculated via length of past
959
+ mask_seq_length = (past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length)
960
+
961
+ if use_cache is True:
962
+ assert (self.is_decoder), f'`use_cache` can only be set to `True` if {self} is used as a decoder'
963
+
964
+ if attention_mask is None:
965
+ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
966
+ if (self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None):
967
+ encoder_seq_length = encoder_hidden_states.shape[1]
968
+ encoder_attention_mask = torch.ones(
969
+ batch_size,
970
+ encoder_seq_length,
971
+ device=inputs_embeds.device,
972
+ dtype=torch.long,
973
+ )
974
+
975
+ # initialize past_key_values with `None` if past does not exist
976
+ if past_key_values is None:
977
+ past_key_values = [None] * len(self.block)
978
+
979
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
980
+ # ourselves in which case we just need to make it broadcastable to all heads.
981
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
982
+
983
+ # If a 2D or 3D attention mask is provided for the cross-attention
984
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
985
+ if self.is_decoder and encoder_hidden_states is not None:
986
+ (
987
+ encoder_batch_size,
988
+ encoder_sequence_length,
989
+ _,
990
+ ) = encoder_hidden_states.size()
991
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
992
+ if encoder_attention_mask is None:
993
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
994
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
995
+ else:
996
+ encoder_extended_attention_mask = None
997
+
998
+ # Prepare head mask if needed
999
+ head_mask = self.get_head_mask(head_mask, self.config.num_layers)
1000
+ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
1001
+ present_key_value_states = () if use_cache else None
1002
+ all_hidden_states = () if output_hidden_states else None
1003
+ all_attentions = () if output_attentions else None
1004
+ all_cross_attentions = () if (output_attentions and self.is_decoder) else None
1005
+ position_bias = None
1006
+ encoder_decoder_position_bias = None
1007
+
1008
+ hidden_states = self.dropout(inputs_embeds)
1009
+
1010
+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
1011
+ layer_head_mask = head_mask[i]
1012
+ cross_attn_layer_head_mask = cross_attn_head_mask[i]
1013
+ # Model parallel
1014
+ if self.model_parallel:
1015
+ torch.cuda.set_device(hidden_states.device)
1016
+ # Ensure that attention_mask is always on the same device as hidden_states
1017
+ if attention_mask is not None:
1018
+ attention_mask = attention_mask.to(hidden_states.device)
1019
+ if position_bias is not None:
1020
+ position_bias = position_bias.to(hidden_states.device)
1021
+ if encoder_hidden_states is not None:
1022
+ encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
1023
+ if encoder_extended_attention_mask is not None:
1024
+ encoder_extended_attention_mask = (encoder_extended_attention_mask.to(hidden_states.device))
1025
+ if encoder_decoder_position_bias is not None:
1026
+ encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
1027
+ if layer_head_mask is not None:
1028
+ layer_head_mask = layer_head_mask.to(hidden_states.device)
1029
+ if cross_attn_layer_head_mask is not None:
1030
+ cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
1031
+ if output_hidden_states:
1032
+ all_hidden_states = all_hidden_states + (hidden_states, )
1033
+
1034
+ if self.gradient_checkpointing and self.training:
1035
+ if use_cache:
1036
+ logger.warning(
1037
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
1038
+ )
1039
+ use_cache = False
1040
+
1041
+ def create_custom_forward(module):
1042
+
1043
+ def custom_forward(*inputs):
1044
+ return tuple(module(*inputs, use_cache, output_attentions))
1045
+
1046
+ return custom_forward
1047
+
1048
+ layer_outputs = checkpoint(
1049
+ create_custom_forward(layer_module),
1050
+ hidden_states,
1051
+ extended_attention_mask,
1052
+ position_bias,
1053
+ encoder_hidden_states,
1054
+ encoder_extended_attention_mask,
1055
+ encoder_decoder_position_bias,
1056
+ layer_head_mask,
1057
+ cross_attn_layer_head_mask,
1058
+ None, # past_key_value is always None with gradient checkpointing
1059
+ )
1060
+ else:
1061
+ layer_outputs = layer_module(
1062
+ hidden_states,
1063
+ attention_mask=extended_attention_mask,
1064
+ position_bias=position_bias,
1065
+ encoder_hidden_states=encoder_hidden_states,
1066
+ encoder_attention_mask=encoder_extended_attention_mask,
1067
+ encoder_decoder_position_bias=encoder_decoder_position_bias,
1068
+ layer_head_mask=layer_head_mask,
1069
+ cross_attn_layer_head_mask=cross_attn_layer_head_mask,
1070
+ past_key_value=past_key_value,
1071
+ use_cache=use_cache,
1072
+ output_attentions=output_attentions,
1073
+ )
1074
+
1075
+ # layer_outputs is a tuple with:
1076
+ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
1077
+ if use_cache is False:
1078
+ layer_outputs = layer_outputs[:1] + (None, ) + layer_outputs[1:]
1079
+
1080
+ hidden_states, present_key_value_state = layer_outputs[:2]
1081
+
1082
+ # We share the position biases between the layers - the first layer stores them
1083
+ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
1084
+ # (cross-attention position bias), (cross-attention weights)
1085
+ position_bias = layer_outputs[2]
1086
+ if self.is_decoder and encoder_hidden_states is not None:
1087
+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
1088
+ # append next layer key value states
1089
+ if use_cache:
1090
+ present_key_value_states = present_key_value_states + (present_key_value_state, )
1091
+
1092
+ if output_attentions:
1093
+ all_attentions = all_attentions + (layer_outputs[3], )
1094
+ if self.is_decoder:
1095
+ all_cross_attentions = all_cross_attentions + (layer_outputs[5], )
1096
+
1097
+ # Model Parallel: If it's the last layer for that device, put things on the next device
1098
+ if self.model_parallel:
1099
+ for k, v in self.device_map.items():
1100
+ if i == v[-1] and 'cuda:' + str(k) != self.last_device:
1101
+ hidden_states = hidden_states.to('cuda:' + str(k + 1))
1102
+
1103
+ hidden_states = self.final_layer_norm(hidden_states)
1104
+ hidden_states = self.dropout(hidden_states)
1105
+
1106
+ # Add last layer
1107
+ if output_hidden_states:
1108
+ all_hidden_states = all_hidden_states + (hidden_states, )
1109
+
1110
+ if not return_dict:
1111
+ return tuple(
1112
+ v for v in [
1113
+ hidden_states,
1114
+ present_key_value_states,
1115
+ all_hidden_states,
1116
+ all_attentions,
1117
+ all_cross_attentions,
1118
+ ] if v is not None
1119
+ )
1120
+ return BaseModelOutputWithPastAndCrossAttentions(
1121
+ last_hidden_state=hidden_states,
1122
+ past_key_values=present_key_value_states,
1123
+ hidden_states=all_hidden_states,
1124
+ attentions=all_attentions,
1125
+ cross_attentions=all_cross_attentions,
1126
+ )
1127
+
1128
+
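For reference, the encoder side of the stack can be run on its own. A minimal sketch, assuming `transformers.T5Config` is importable and using its small default dimensions (the input shape is illustrative):

```python
# Run an encoder-only T5Stack standalone with randomly initialized weights.
import copy

import torch
from torch import nn
from transformers import T5Config

config = T5Config()                       # defaults: d_model=512, num_layers=6, num_heads=8
enc_cfg = copy.deepcopy(config)
enc_cfg.is_decoder = False
enc_cfg.use_cache = False

embed = nn.Embedding(config.vocab_size, config.d_model)
encoder = T5Stack(enc_cfg, embed)         # class defined above
input_ids = torch.randint(0, config.vocab_size, (1, 7))
out = encoder(input_ids=input_ids, return_dict=True)
print(out.last_hidden_state.shape)        # torch.Size([1, 7, 512])
```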
1129
+ T5_START_DOCSTRING = r"""
1130
+
1131
+ The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
1132
+ Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
1133
+ Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder-decoder transformer pre-trained in a
1134
+ text-to-text denoising generative setting.
1135
+
1136
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1137
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
1138
+ etc.)
1139
+
1140
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1141
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
1142
+ and behavior.
1143
+
1144
+ Parameters:
1145
+ config ([`T5Config`]): Model configuration class with all the parameters of the model.
1146
+ Initializing with a config file does not load the weights associated with the model, only the
1147
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1148
+ """
1149
+
1150
+ T5_INPUTS_DOCSTRING = r"""
1151
+ Args:
1152
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1153
+ Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
1154
+ should be able to pad the inputs on both the right and the left.
1155
+
1156
+ Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
1157
+ [`PreTrainedTokenizer.__call__`] for details.
1158
+
1159
+ [What are input IDs?](../glossary#input-ids)
1160
+
1161
+ To know more about how to prepare `input_ids` for pretraining, take a look at [T5 Training](./t5#training).
1162
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1163
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1164
+
1165
+ - 1 for tokens that are **not masked**,
1166
+ - 0 for tokens that are **masked**.
1167
+
1168
+ [What are attention masks?](../glossary#attention-mask)
1169
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
1170
+ Indices of decoder input sequence tokens in the vocabulary.
1171
+
1172
+ Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
1173
+ [`PreTrainedTokenizer.__call__`] for details.
1174
+
1175
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
1176
+
1177
+ T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
1178
+ is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
1179
+
1180
+ To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
1181
+ Training](./t5#training).
1182
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
1183
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
1184
+ be used by default.
1185
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1186
+ Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
1187
+ 1]`:
1188
+
1189
+ - 1 indicates the head is **not masked**,
1190
+ - 0 indicates the head is **masked**.
1191
+
1192
+ decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1193
+ Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
1194
+ 1]`:
1195
+
1196
+ - 1 indicates the head is **not masked**,
1197
+ - 0 indicates the head is **masked**.
1198
+
1199
+ cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1200
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
1201
+ `[0, 1]`:
1202
+
1203
+ - 1 indicates the head is **not masked**,
1204
+ - 0 indicates the head is **masked**.
1205
+
1206
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
1207
+ Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
1208
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
1209
+ the output of the last layer of the encoder. Used in the cross-attention of the decoder.
1210
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1211
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1212
+
1213
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1214
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1215
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1216
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1217
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1218
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1219
+ model's internal embedding lookup matrix.
1220
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
1221
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
1222
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
1223
+ input (see `past_key_values`). This is useful if you want more control over how to convert
1224
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
1225
+
1226
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
1227
+ of `inputs_embeds`.
1228
+
1229
+ use_cache (`bool`, *optional*):
1230
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1231
+ `past_key_values`).
1232
+
1233
+ output_attentions (`bool`, *optional*):
1234
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1235
+ tensors for more detail.
1236
+ output_hidden_states (`bool`, *optional*):
1237
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1238
+ more detail.
1239
+ return_dict (`bool`, *optional*):
1240
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1241
+ """
1242
+
1243
+ T5_ENCODER_INPUTS_DOCSTRING = r"""
1244
+ Args:
1245
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1246
+ Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
1247
+ should be able to pad the inputs on both the right and the left.
1248
+
1249
+ Indices can be obtained using [`T5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
1250
+ [`PreTrainedTokenizer.__call__`] for details.
1251
+
1252
+ To know more about how to prepare `input_ids` for pretraining, take a look at [T5 Training](./t5#training).
1253
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1254
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1255
+
1256
+ - 1 for tokens that are **not masked**,
1257
+ - 0 for tokens that are **masked**.
1258
+
1259
+ [What are attention masks?](../glossary#attention-mask)
1260
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1261
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
1262
+
1263
+ - 1 indicates the head is **not masked**,
1264
+ - 0 indicates the head is **masked**.
1265
+
1266
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1267
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1268
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1269
+ model's internal embedding lookup matrix.
1270
+ output_attentions (`bool`, *optional*):
1271
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1272
+ tensors for more detail.
1273
+ output_hidden_states (`bool`, *optional*):
1274
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1275
+ more detail.
1276
+ return_dict (`bool`, *optional*):
1277
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1278
+ """
1279
+
1280
+ # Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
1281
+ __HEAD_MASK_WARNING_MSG = """
1282
+ The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
1283
+ `decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
1284
+ If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
1285
+ num_heads)`.
1286
+ """
1287
+
1288
+
1289
+ @add_start_docstrings(
1290
+ 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.',
1291
+ T5_START_DOCSTRING,
1292
+ )
1293
+ class T5Model(T5PreTrainedModel):
1294
+ _keys_to_ignore_on_load_missing = [
1295
+ r'encoder.embed_tokens.weight',
1296
+ r'decoder.embed_tokens.weight',
1297
+ ]
1298
+ _keys_to_ignore_on_load_unexpected = [
1299
+ r'decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight',
1300
+ ]
1301
+
1302
+ def __init__(self, config: T5Config):
1303
+ super().__init__(config)
1304
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
1305
+
1306
+ encoder_config = copy.deepcopy(config)
1307
+ encoder_config.is_decoder = False
1308
+ encoder_config.use_cache = False
1309
+ encoder_config.is_encoder_decoder = False
1310
+ self.encoder = T5Stack(encoder_config, self.shared)
1311
+
1312
+ decoder_config = copy.deepcopy(config)
1313
+ decoder_config.is_decoder = True
1314
+ decoder_config.is_encoder_decoder = False
1315
+ decoder_config.num_layers = config.num_decoder_layers
1316
+ self.decoder = T5Stack(decoder_config, self.shared)
1317
+
1318
+ # Initialize weights and apply final processing
1319
+ self.post_init()
1320
+
1321
+ # Model parallel
1322
+ self.model_parallel = False
1323
+ self.device_map = None
1324
+
1325
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
1326
+ def parallelize(self, device_map=None):
1327
+ self.device_map = (
1328
+ get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
1329
+ if device_map is None else device_map
1330
+ )
1331
+ assert_device_map(self.device_map, len(self.encoder.block))
1332
+ self.encoder.parallelize(self.device_map)
1333
+ self.decoder.parallelize(self.device_map)
1334
+ self.model_parallel = True
1335
+
1336
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
1337
+ def deparallelize(self):
1338
+ self.encoder.deparallelize()
1339
+ self.decoder.deparallelize()
1340
+ self.encoder = self.encoder.to('cpu')
1341
+ self.decoder = self.decoder.to('cpu')
1342
+ self.model_parallel = False
1343
+ self.device_map = None
1344
+ torch.cuda.empty_cache()
1345
+
1346
+ def get_input_embeddings(self):
1347
+ return self.shared
1348
+
1349
+ def set_input_embeddings(self, new_embeddings):
1350
+ self.shared = new_embeddings
1351
+ self.encoder.set_input_embeddings(new_embeddings)
1352
+ self.decoder.set_input_embeddings(new_embeddings)
1353
+
1354
+ def get_encoder(self):
1355
+ return self.encoder
1356
+
1357
+ def get_decoder(self):
1358
+ return self.decoder
1359
+
1360
+ def _prune_heads(self, heads_to_prune):
1361
+ """
1362
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
1363
+ base class PreTrainedModel.
1364
+ """
1365
+ for layer, heads in heads_to_prune.items():
1366
+ self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
1367
+
1368
+ @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
1369
+ @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
1370
+ def forward(
1371
+ self,
1372
+ input_ids: Optional[torch.LongTensor] = None,
1373
+ attention_mask: Optional[torch.FloatTensor] = None,
1374
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1375
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
1376
+ head_mask: Optional[torch.FloatTensor] = None,
1377
+ decoder_head_mask: Optional[torch.FloatTensor] = None,
1378
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1379
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1380
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1381
+ inputs_embeds: Optional[torch.Tensor] = None,
1382
+ decoder_inputs_embeds: Optional[torch.Tensor] = None,
1383
+ use_cache: Optional[bool] = None,
1384
+ output_attentions: Optional[bool] = None,
1385
+ output_hidden_states: Optional[bool] = None,
1386
+ return_dict: Optional[bool] = None,
1387
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
1388
+ r"""
1389
+ Returns:
1390
+
1391
+ Example:
1392
+
1393
+ ```python
1394
+ >>> from transformers import T5Tokenizer, T5Model
1395
+
1396
+ >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
1397
+ >>> model = T5Model.from_pretrained("t5-small")
1398
+
1399
+ >>> input_ids = tokenizer(
1400
+ ... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
1401
+ ... ).input_ids # Batch size 1
1402
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
1403
+
1404
+ >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
1405
+ >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
1406
+ >>> decoder_input_ids = model._shift_right(decoder_input_ids)
1407
+
1408
+ >>> # forward pass
1409
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
1410
+ >>> last_hidden_states = outputs.last_hidden_state
1411
+ ```"""
1412
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1413
+ return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
1414
+
1415
+ # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
1416
+ if head_mask is not None and decoder_head_mask is None:
1417
+ if self.config.num_layers == self.config.num_decoder_layers:
1418
+ warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
1419
+ decoder_head_mask = head_mask
1420
+
1421
+ # Encode if needed (training, first prediction pass)
1422
+ if encoder_outputs is None:
1423
+ encoder_outputs = self.encoder(
1424
+ input_ids=input_ids,
1425
+ attention_mask=attention_mask,
1426
+ inputs_embeds=inputs_embeds,
1427
+ head_mask=head_mask,
1428
+ output_attentions=output_attentions,
1429
+ output_hidden_states=output_hidden_states,
1430
+ return_dict=return_dict,
1431
+ )
1432
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1433
+ encoder_outputs = BaseModelOutput(
1434
+ last_hidden_state=encoder_outputs[0],
1435
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1436
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1437
+ )
1438
+
1439
+ hidden_states = encoder_outputs[0]
1440
+
1441
+ # Set device for model parallelism
1442
+ if self.model_parallel:
1443
+ torch.cuda.set_device(self.decoder.first_device)
1444
+ hidden_states = hidden_states.to(self.decoder.first_device)
1445
+ if decoder_input_ids is not None:
1446
+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
1447
+ if attention_mask is not None:
1448
+ attention_mask = attention_mask.to(self.decoder.first_device)
1449
+ if decoder_attention_mask is not None:
1450
+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
1451
+
1452
+ # Decode
1453
+ decoder_outputs = self.decoder(
1454
+ input_ids=decoder_input_ids,
1455
+ attention_mask=decoder_attention_mask,
1456
+ inputs_embeds=decoder_inputs_embeds,
1457
+ past_key_values=past_key_values,
1458
+ encoder_hidden_states=hidden_states,
1459
+ encoder_attention_mask=attention_mask,
1460
+ head_mask=decoder_head_mask,
1461
+ cross_attn_head_mask=cross_attn_head_mask,
1462
+ use_cache=use_cache,
1463
+ output_attentions=output_attentions,
1464
+ output_hidden_states=output_hidden_states,
1465
+ return_dict=return_dict,
1466
+ )
1467
+
1468
+ if not return_dict:
1469
+ return decoder_outputs + encoder_outputs
1470
+
1471
+ return Seq2SeqModelOutput(
1472
+ last_hidden_state=decoder_outputs.last_hidden_state,
1473
+ past_key_values=decoder_outputs.past_key_values,
1474
+ decoder_hidden_states=decoder_outputs.hidden_states,
1475
+ decoder_attentions=decoder_outputs.attentions,
1476
+ cross_attentions=decoder_outputs.cross_attentions,
1477
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1478
+ encoder_hidden_states=encoder_outputs.hidden_states,
1479
+ encoder_attentions=encoder_outputs.attentions,
1480
+ )
1481
+
1482
+
1483
+ @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
1484
+ class T5ForConditionalGeneration(T5PreTrainedModel):
1485
+ _keys_to_ignore_on_load_missing = [
1486
+ r'encoder.embed_tokens.weight',
1487
+ r'decoder.embed_tokens.weight',
1488
+ r'lm_head.weight',
1489
+ ]
1490
+ _keys_to_ignore_on_load_unexpected = [
1491
+ r'decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight',
1492
+ ]
1493
+
1494
+ def __init__(self, config: T5Config):
1495
+ super().__init__(config)
1496
+ self.model_dim = config.d_model
1497
+
1498
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
1499
+
1500
+ encoder_config = copy.deepcopy(config)
1501
+ encoder_config.is_decoder = False
1502
+ encoder_config.use_cache = False
1503
+ encoder_config.is_encoder_decoder = False
1504
+ self.encoder = T5Stack(encoder_config, self.shared)
1505
+
1506
+ decoder_config = copy.deepcopy(config)
1507
+ decoder_config.is_decoder = True
1508
+ decoder_config.is_encoder_decoder = False
1509
+ decoder_config.num_layers = config.num_decoder_layers
1510
+ self.decoder = T5Stack(decoder_config, self.shared)
1511
+
1512
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
1513
+
1514
+ # Initialize weights and apply final processing
1515
+ self.post_init()
1516
+
1517
+ # Model parallel
1518
+ self.model_parallel = False
1519
+ self.device_map = None
1520
+
1521
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
1522
+ def parallelize(self, device_map=None):
1523
+ self.device_map = (
1524
+ get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
1525
+ if device_map is None else device_map
1526
+ )
1527
+ assert_device_map(self.device_map, len(self.encoder.block))
1528
+ self.encoder.parallelize(self.device_map)
1529
+ self.decoder.parallelize(self.device_map)
1530
+ self.lm_head = self.lm_head.to(self.decoder.first_device)
1531
+ self.model_parallel = True
1532
+
1533
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
1534
+ def deparallelize(self):
1535
+ self.encoder.deparallelize()
1536
+ self.decoder.deparallelize()
1537
+ self.encoder = self.encoder.to('cpu')
1538
+ self.decoder = self.decoder.to('cpu')
1539
+ self.lm_head = self.lm_head.to('cpu')
1540
+ self.model_parallel = False
1541
+ self.device_map = None
1542
+ torch.cuda.empty_cache()
1543
+
1544
+ def get_input_embeddings(self):
1545
+ return self.shared
1546
+
1547
+ def set_input_embeddings(self, new_embeddings):
1548
+ self.shared = new_embeddings
1549
+ self.encoder.set_input_embeddings(new_embeddings)
1550
+ self.decoder.set_input_embeddings(new_embeddings)
1551
+
1552
+ def set_output_embeddings(self, new_embeddings):
1553
+ self.lm_head = new_embeddings
1554
+
1555
+ def get_output_embeddings(self):
1556
+ return self.lm_head
1557
+
1558
+ def get_encoder(self):
1559
+ return self.encoder
1560
+
1561
+ def get_decoder(self):
1562
+ return self.decoder
1563
+
1564
+ @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
1565
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
1566
+ def forward(
1567
+ self,
1568
+ input_ids: Optional[torch.LongTensor] = None,
1569
+ attention_mask: Optional[torch.FloatTensor] = None,
1570
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1571
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
1572
+ head_mask: Optional[torch.FloatTensor] = None,
1573
+ decoder_head_mask: Optional[torch.FloatTensor] = None,
1574
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1575
+ encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
1576
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
1577
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1578
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1579
+ labels: Optional[torch.LongTensor] = None,
1580
+ use_cache: Optional[bool] = None,
1581
+ output_attentions: Optional[bool] = None,
1582
+ output_hidden_states: Optional[bool] = None,
1583
+ return_dict: Optional[bool] = None,
1584
+ reduction: Optional[str] = 'mean',
1585
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
1586
+ r"""
1587
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1588
+ Labels for computing the sequence-to-sequence language modeling loss. Indices should be in `[-100, 0, ...,
1589
+ config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
1590
+ labels in `[0, ..., config.vocab_size - 1]`
1591
+
1592
+ Returns:
1593
+
1594
+ Examples:
1595
+
1596
+ ```python
1597
+ >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
1598
+
1599
+ >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
1600
+ >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
1601
+
1602
+ >>> # training
1603
+ >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
1604
+ >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
1605
+ >>> outputs = model(input_ids=input_ids, labels=labels)
1606
+ >>> loss = outputs.loss
1607
+ >>> logits = outputs.logits
1608
+
1609
+ >>> # inference
1610
+ >>> input_ids = tokenizer(
1611
+ ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
1612
+ ... ).input_ids # Batch size 1
1613
+ >>> outputs = model.generate(input_ids)
1614
+ >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
1615
+ >>> # studies have shown that owning a dog is good for you.
1616
+ ```"""
1617
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1618
+ return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
1619
+
1620
+ # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
1621
+ if head_mask is not None and decoder_head_mask is None:
1622
+ if self.config.num_layers == self.config.num_decoder_layers:
1623
+ warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
1624
+ decoder_head_mask = head_mask
1625
+
1626
+ # Encode if needed (training, first prediction pass)
1627
+ if encoder_outputs is None:
1628
+ # Convert encoder inputs in embeddings if needed
1629
+ encoder_outputs = self.encoder(
1630
+ input_ids=input_ids,
1631
+ attention_mask=attention_mask,
1632
+ inputs_embeds=inputs_embeds,
1633
+ head_mask=head_mask,
1634
+ output_attentions=output_attentions,
1635
+ output_hidden_states=output_hidden_states,
1636
+ return_dict=return_dict,
1637
+ )
1638
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1639
+ encoder_outputs = BaseModelOutput(
1640
+ last_hidden_state=encoder_outputs[0],
1641
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1642
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1643
+ )
1644
+
1645
+ hidden_states = encoder_outputs[0]
1646
+
1647
+ if self.model_parallel:
1648
+ torch.cuda.set_device(self.decoder.first_device)
1649
+
1650
+ if (labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None):
1651
+ # get decoder inputs from shifting lm labels to the right
1652
+ decoder_input_ids = self._shift_right(labels)
1653
+
1654
+ # Set device for model parallelism
1655
+ if self.model_parallel:
1656
+ torch.cuda.set_device(self.decoder.first_device)
1657
+ hidden_states = hidden_states.to(self.decoder.first_device)
1658
+ if decoder_input_ids is not None:
1659
+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
1660
+ if attention_mask is not None:
1661
+ attention_mask = attention_mask.to(self.decoder.first_device)
1662
+ if decoder_attention_mask is not None:
1663
+ decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
1664
+
1665
+ # Decode
1666
+ decoder_outputs = self.decoder(
1667
+ input_ids=decoder_input_ids,
1668
+ attention_mask=decoder_attention_mask,
1669
+ inputs_embeds=decoder_inputs_embeds,
1670
+ past_key_values=past_key_values,
1671
+ encoder_hidden_states=hidden_states,
1672
+ encoder_attention_mask=attention_mask,
1673
+ head_mask=decoder_head_mask,
1674
+ cross_attn_head_mask=cross_attn_head_mask,
1675
+ use_cache=use_cache,
1676
+ output_attentions=output_attentions,
1677
+ output_hidden_states=output_hidden_states,
1678
+ return_dict=return_dict,
1679
+ )
1680
+
1681
+ sequence_output = decoder_outputs[0]
1682
+
1683
+ # Set device for model parallelism
1684
+ if self.model_parallel:
1685
+ torch.cuda.set_device(self.encoder.first_device)
1686
+ self.lm_head = self.lm_head.to(self.encoder.first_device)
1687
+ sequence_output = sequence_output.to(self.lm_head.weight.device)
1688
+
1689
+ if self.config.tie_word_embeddings:
1690
+ # Rescale output before projecting on vocab
1691
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
1692
+ sequence_output = sequence_output * (self.model_dim**-0.5)
1693
+
1694
+ lm_logits = self.lm_head(sequence_output)
1695
+
1696
+ loss = None
1697
+ if labels is not None:
1698
+ loss_fct = CrossEntropyLoss(ignore_index=-100, reduction=reduction)
1699
+ loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
1700
+ if reduction == 'none':
1701
+ loss = loss.view(lm_logits.size(0), -1).sum(1)
1702
+
1703
+ if not return_dict:
1704
+ output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs
1705
+ return ((loss, ) + output) if loss is not None else output
1706
+
1707
+ return Seq2SeqLMOutput(
1708
+ loss=loss,
1709
+ logits=lm_logits,
1710
+ past_key_values=decoder_outputs.past_key_values,
1711
+ decoder_hidden_states=decoder_outputs.hidden_states,
1712
+ decoder_attentions=decoder_outputs.attentions,
1713
+ cross_attentions=decoder_outputs.cross_attentions,
1714
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1715
+ encoder_hidden_states=encoder_outputs.hidden_states,
1716
+ encoder_attentions=encoder_outputs.attentions,
1717
+ )
1718
+
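Unlike the doctest above, which uses the default mean loss, this `forward` also exposes a `reduction` argument; with `reduction='none'` the cross-entropy is summed per sequence, giving one loss value per batch item. A minimal usage sketch (the `t5-small` checkpoint follows the doctests above; the example sentences are illustrative):

```python
# Per-sample loss via the reduction argument of the forward defined in this file.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")  # class defined above

enc = tokenizer(["translate English to German: Hello.", "summarize: a long article"],
                return_tensors="pt", padding=True)
labels = tokenizer(["Hallo.", "a summary"], return_tensors="pt", padding=True).input_ids
labels[labels == tokenizer.pad_token_id] = -100   # padded label positions are ignored

out = model(**enc, labels=labels, reduction='none')
print(out.loss.shape)  # torch.Size([2]): summed token NLL per sequence
```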
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         past=None,
+         attention_mask=None,
+         head_mask=None,
+         decoder_head_mask=None,
+         cross_attn_head_mask=None,
+         use_cache=None,
+         encoder_outputs=None,
+         **kwargs,
+     ):
+
+         # cut decoder_input_ids if past is used
+         if past is not None:
+             input_ids = input_ids[:, -1:]
+
+         return {
+             'decoder_input_ids': input_ids,
+             'past_key_values': past,
+             'encoder_outputs': encoder_outputs,
+             'attention_mask': attention_mask,
+             'head_mask': head_mask,
+             'decoder_head_mask': decoder_head_mask,
+             'cross_attn_head_mask': cross_attn_head_mask,
+             'use_cache': use_cache,
+         }
+
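`prepare_inputs_for_generation` is the hook that `generate()` calls at every decoding step: once `past` is populated, only the newest decoder token is kept, because earlier positions are already covered by the cached key/value states. A rough sketch of the loop this hook supports, written against the forward above (the function name `greedy_decode` and the reuse of `encoder_last_hidden_state` as the cached encoder output are illustrative choices, not package API):

```python
import torch


@torch.no_grad()
def greedy_decode(model, tokenizer, text, max_new_tokens=20):
    enc = tokenizer(text, return_tensors='pt')
    decoder_input_ids = torch.full((1, 1), model.config.decoder_start_token_id, dtype=torch.long)
    past, encoder_outputs = None, None
    for _ in range(max_new_tokens):
        step_inputs = model.prepare_inputs_for_generation(
            decoder_input_ids,
            past=past,
            attention_mask=enc.attention_mask,
            encoder_outputs=encoder_outputs,
            use_cache=True,
        )
        out = model(input_ids=enc.input_ids, **step_inputs, return_dict=True)
        next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
        decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=-1)
        past = out.past_key_values  # cache grows, so later steps feed only the newest token
        if encoder_outputs is None:
            # Encode once; on later steps the forward skips the encoder entirely.
            encoder_outputs = (out.encoder_last_hidden_state,)
        if next_token.item() == model.config.eos_token_id:
            break
    return tokenizer.decode(decoder_input_ids[0, 1:], skip_special_tokens=True)
```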
+     def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+         return self._shift_right(labels)
+
+     def _reorder_cache(self, past, beam_idx):
+         # if decoder past is not included in output
+         # speedy decoding is disabled and no need to reorder
+         if past is None:
+             logger.warning('You might want to consider setting `use_cache=True` to speed up decoding')
+             return past
+
+         reordered_decoder_past = ()
+         for layer_past_states in past:
+             # get the correct batch idx from layer past batch dim
+             # batch dim of `past` is at 2nd position
+             reordered_layer_past_states = ()
+             for layer_past_state in layer_past_states:
+                 # need to set correct `past` for each of the four key / value states
+                 reordered_layer_past_states = reordered_layer_past_states + (
+                     layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
+                 )
+
+             assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
+             assert len(reordered_layer_past_states) == len(layer_past_states)
+
+             reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states, )
+         return reordered_decoder_past
+
+
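`_reorder_cache` only matters for beam search: each cached key/value tensor carries the (batch × beams) dimension first, and when beams are re-ranked the cache has to be permuted to follow the surviving hypotheses, which is exactly what the `index_select(0, beam_idx)` above does. A toy illustration with made-up shapes:

```python
import torch

# Pretend cache entry for 4 beams, collapsed to shape (beams, 1, 1, 1) for readability.
key_state = torch.arange(4.0).view(4, 1, 1, 1)
# beam_idx[i] = index of the old beam that new beam i continues from.
beam_idx = torch.tensor([0, 0, 3, 1])
reordered = key_state.index_select(0, beam_idx)
print(reordered.flatten().tolist())  # [0.0, 0.0, 3.0, 1.0]
```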
+ @add_start_docstrings(
+     "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
+     T5_START_DOCSTRING,
+ )
+ class T5EncoderModel(T5PreTrainedModel):
+     authorized_missing_keys = [
+         r'encoder.embed_tokens.weight',
+     ]
+
+     def __init__(self, config: T5Config):
+         super().__init__(config)
+         self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+         encoder_config = copy.deepcopy(config)
+         encoder_config.use_cache = False
+         encoder_config.is_encoder_decoder = False
+         self.encoder = T5Stack(encoder_config, self.shared)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+         # Model parallel
+         self.model_parallel = False
+         self.device_map = None
+
+     @add_start_docstrings(PARALLELIZE_DOCSTRING)
+     def parallelize(self, device_map=None):
+         self.device_map = (
+             get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
+             if device_map is None else device_map
+         )
+         assert_device_map(self.device_map, len(self.encoder.block))
+         self.encoder.parallelize(self.device_map)
+         self.model_parallel = True
+
+     @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+     def deparallelize(self):
+         self.encoder.deparallelize()
+         self.encoder = self.encoder.to('cpu')
+         self.model_parallel = False
+         self.device_map = None
+         torch.cuda.empty_cache()
+
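`parallelize` spreads the encoder blocks over the visible GPUs, either evenly via `get_device_map` or according to an explicit `device_map` of the form `{device_index: [block_indices]}`; `deparallelize` moves everything back to the CPU and releases the CUDA cache. A hedged sketch, assuming this vendored `T5EncoderModel` loads the published `t5-large` checkpoint (24 encoder blocks) the same way the upstream class does:

```python
# Split the 24 encoder blocks of t5-large across two GPUs, then undo the split.
model = T5EncoderModel.from_pretrained('t5-large')
model.parallelize({0: list(range(0, 12)), 1: list(range(12, 24))})
# ... forward passes now run blocks 0-11 on cuda:0 and blocks 12-23 on cuda:1 ...
model.deparallelize()  # everything back on CPU, CUDA cache emptied
```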
+     def get_input_embeddings(self):
+         return self.shared
+
+     def set_input_embeddings(self, new_embeddings):
+         self.shared = new_embeddings
+         self.encoder.set_input_embeddings(new_embeddings)
+
+     def get_encoder(self):
+         return self.encoder
+
+     def _prune_heads(self, heads_to_prune):
+         """
+         Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+         class PreTrainedModel
+         """
+         for layer, heads in heads_to_prune.items():
+             self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
+
+     @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
+     @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+         r"""
+         Returns:
+
+         Example:
+
+         ```python
+         >>> from transformers import T5Tokenizer, T5EncoderModel
+
+         >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+         >>> model = T5EncoderModel.from_pretrained("t5-small")
+         >>> input_ids = tokenizer(
+         ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+         ... ).input_ids # Batch size 1
+         >>> outputs = model(input_ids=input_ids)
+         >>> last_hidden_states = outputs.last_hidden_state
+         ```"""
+         return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
+
+         encoder_outputs = self.encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         return encoder_outputs
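Since `T5EncoderModel.forward` returns the encoder stack's output unchanged, a common downstream pattern (e.g. for retrieval-style similarity) is masked mean pooling over `last_hidden_state` to obtain one vector per input. A small sketch building on the docstring example above; the pooling itself is an assumption about typical usage, not something this file provides:

```python
import torch
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5EncoderModel.from_pretrained('t5-small')  # assumes the vendored class loads upstream weights

batch = tokenizer(['a photo of a dog', 'a photo of a cat'], padding=True, return_tensors='pt')
with torch.no_grad():
    hidden = model(**batch).last_hidden_state              # (batch, seq_len, d_model)

mask = batch.attention_mask.unsqueeze(-1).type_as(hidden)  # (batch, seq_len, 1)
embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # masked mean pool -> (batch, d_model)
print(embeddings.shape)                                    # torch.Size([2, 512]) for t5-small
```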