paddlex 3.0.0b2__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1211)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +17 -33
  3. paddlex/__main__.py +4 -5
  4. paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml +38 -0
  5. paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
  6. paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
  7. paddlex/configs/modules/face_feature/MobileFaceNet.yaml +41 -0
  8. paddlex/configs/modules/face_feature/ResNet50_face.yaml +41 -0
  9. paddlex/configs/modules/formula_recognition/LaTeX_OCR_rec.yaml +40 -0
  10. paddlex/configs/modules/formula_recognition/PP-FormulaNet-L.yaml +40 -0
  11. paddlex/configs/modules/formula_recognition/PP-FormulaNet-S.yaml +40 -0
  12. paddlex/configs/modules/formula_recognition/UniMERNet.yaml +40 -0
  13. paddlex/configs/modules/image_classification/CLIP_vit_base_patch16_224.yaml +41 -0
  14. paddlex/configs/modules/image_classification/CLIP_vit_large_patch14_224.yaml +41 -0
  15. paddlex/configs/modules/image_classification/ConvNeXt_large_384.yaml +41 -0
  16. paddlex/configs/modules/keypoint_detection/PP-TinyPose_128x96.yaml +40 -0
  17. paddlex/configs/modules/keypoint_detection/PP-TinyPose_256x192.yaml +40 -0
  18. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +40 -0
  19. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +40 -0
  20. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +40 -0
  21. paddlex/configs/modules/multilingual_speech_recognition/whisper_base.yaml +12 -0
  22. paddlex/configs/modules/multilingual_speech_recognition/whisper_large.yaml +12 -0
  23. paddlex/configs/modules/multilingual_speech_recognition/whisper_medium.yaml +12 -0
  24. paddlex/configs/modules/multilingual_speech_recognition/whisper_small.yaml +12 -0
  25. paddlex/configs/modules/multilingual_speech_recognition/whisper_tiny.yaml +12 -0
  26. paddlex/configs/modules/object_detection/Co-DINO-R50.yaml +40 -0
  27. paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml +40 -0
  28. paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml +40 -0
  29. paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml +40 -0
  30. paddlex/configs/modules/object_detection/YOLOX-X.yaml +40 -0
  31. paddlex/configs/modules/open_vocabulary_detection/GroundingDINO-T.yaml +13 -0
  32. paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
  33. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_box.yaml +17 -0
  34. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_point.yaml +15 -0
  35. paddlex/configs/modules/rotated_object_detection/PP-YOLOE-R-L.yaml +40 -0
  36. paddlex/configs/modules/semantic_segmentation/MaskFormer_small.yaml +42 -0
  37. paddlex/configs/modules/semantic_segmentation/MaskFormer_tiny.yaml +42 -0
  38. paddlex/configs/modules/semantic_segmentation/SeaFormer_base.yaml +40 -0
  39. paddlex/configs/modules/semantic_segmentation/SeaFormer_large.yaml +40 -0
  40. paddlex/configs/modules/semantic_segmentation/SeaFormer_small.yaml +40 -0
  41. paddlex/configs/modules/semantic_segmentation/SeaFormer_tiny.yaml +40 -0
  42. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wired_table_cell_det.yaml +40 -0
  43. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wireless_table_cell_det.yaml +40 -0
  44. paddlex/configs/modules/table_classification/PP-LCNet_x1_0_table_cls.yaml +41 -0
  45. paddlex/configs/modules/table_structure_recognition/SLANeXt_wired.yaml +39 -0
  46. paddlex/configs/modules/table_structure_recognition/SLANeXt_wireless.yaml +39 -0
  47. paddlex/configs/modules/text_detection/PP-OCRv3_mobile_det.yaml +40 -0
  48. paddlex/configs/modules/text_detection/PP-OCRv3_server_det.yaml +40 -0
  49. paddlex/configs/modules/text_recognition/PP-OCRv3_mobile_rec.yaml +39 -0
  50. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec_doc.yaml +39 -0
  51. paddlex/configs/modules/text_recognition/arabic_PP-OCRv3_mobile_rec.yaml +39 -0
  52. paddlex/configs/modules/text_recognition/chinese_cht_PP-OCRv3_mobile_rec.yaml +39 -0
  53. paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv3_mobile_rec.yaml +39 -0
  54. paddlex/configs/modules/text_recognition/devanagari_PP-OCRv3_mobile_rec.yaml +39 -0
  55. paddlex/configs/modules/text_recognition/en_PP-OCRv3_mobile_rec.yaml +39 -0
  56. paddlex/configs/modules/text_recognition/en_PP-OCRv4_mobile_rec.yaml +39 -0
  57. paddlex/configs/modules/text_recognition/japan_PP-OCRv3_mobile_rec.yaml +39 -0
  58. paddlex/configs/modules/text_recognition/ka_PP-OCRv3_mobile_rec.yaml +39 -0
  59. paddlex/configs/modules/text_recognition/korean_PP-OCRv3_mobile_rec.yaml +39 -0
  60. paddlex/configs/modules/text_recognition/latin_PP-OCRv3_mobile_rec.yaml +39 -0
  61. paddlex/configs/modules/text_recognition/ta_PP-OCRv3_mobile_rec.yaml +39 -0
  62. paddlex/configs/modules/text_recognition/te_PP-OCRv3_mobile_rec.yaml +39 -0
  63. paddlex/configs/modules/textline_orientation/PP-LCNet_x0_25_textline_ori.yaml +41 -0
  64. paddlex/configs/modules/video_classification/PP-TSM-R50_8frames_uniform.yaml +42 -0
  65. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_16frames_uniform.yaml +42 -0
  66. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_8frames_uniform.yaml +42 -0
  67. paddlex/configs/modules/video_detection/YOWO.yaml +40 -0
  68. paddlex/configs/pipelines/3d_bev_detection.yaml +9 -0
  69. paddlex/configs/pipelines/OCR.yaml +44 -0
  70. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +149 -0
  71. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +184 -0
  72. paddlex/configs/pipelines/PP-ShiTuV2.yaml +18 -0
  73. paddlex/configs/pipelines/PP-StructureV3.yaml +226 -0
  74. paddlex/configs/pipelines/anomaly_detection.yaml +8 -0
  75. paddlex/configs/pipelines/doc_preprocessor.yaml +15 -0
  76. paddlex/configs/pipelines/doc_understanding.yaml +9 -0
  77. paddlex/configs/pipelines/face_recognition.yaml +18 -0
  78. paddlex/configs/pipelines/formula_recognition.yaml +39 -0
  79. paddlex/configs/pipelines/human_keypoint_detection.yaml +17 -0
  80. paddlex/configs/pipelines/image_classification.yaml +10 -0
  81. paddlex/configs/pipelines/image_multilabel_classification.yaml +9 -0
  82. paddlex/configs/pipelines/instance_segmentation.yaml +10 -0
  83. paddlex/configs/pipelines/layout_parsing.yaml +101 -0
  84. paddlex/configs/pipelines/multilingual_speech_recognition.yaml +9 -0
  85. paddlex/configs/pipelines/object_detection.yaml +10 -0
  86. paddlex/configs/pipelines/open_vocabulary_detection.yaml +12 -0
  87. paddlex/configs/pipelines/open_vocabulary_segmentation.yaml +13 -0
  88. paddlex/configs/pipelines/pedestrian_attribute_recognition.yaml +15 -0
  89. paddlex/configs/pipelines/rotated_object_detection.yaml +10 -0
  90. paddlex/configs/pipelines/seal_recognition.yaml +51 -0
  91. paddlex/configs/pipelines/semantic_segmentation.yaml +10 -0
  92. paddlex/configs/pipelines/small_object_detection.yaml +10 -0
  93. paddlex/configs/pipelines/table_recognition.yaml +56 -0
  94. paddlex/configs/pipelines/table_recognition_v2.yaml +76 -0
  95. paddlex/configs/pipelines/ts_anomaly_detection.yaml +8 -0
  96. paddlex/configs/pipelines/ts_classification.yaml +8 -0
  97. paddlex/configs/pipelines/ts_forecast.yaml +8 -0
  98. paddlex/configs/pipelines/vehicle_attribute_recognition.yaml +15 -0
  99. paddlex/configs/pipelines/video_classification.yaml +9 -0
  100. paddlex/configs/pipelines/video_detection.yaml +10 -0
  101. paddlex/constants.py +17 -0
  102. paddlex/engine.py +8 -6
  103. paddlex/hpip_links.html +31 -0
  104. paddlex/inference/__init__.py +4 -2
  105. paddlex/inference/common/__init__.py +13 -0
  106. paddlex/inference/common/batch_sampler/__init__.py +21 -0
  107. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +83 -0
  108. paddlex/inference/common/batch_sampler/base_batch_sampler.py +94 -0
  109. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +144 -0
  110. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +64 -0
  111. paddlex/inference/common/batch_sampler/image_batch_sampler.py +112 -0
  112. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +109 -0
  113. paddlex/inference/common/batch_sampler/video_batch_sampler.py +74 -0
  114. paddlex/inference/common/reader/__init__.py +19 -0
  115. paddlex/inference/common/reader/audio_reader.py +46 -0
  116. paddlex/inference/common/reader/det_3d_reader.py +241 -0
  117. paddlex/inference/common/reader/image_reader.py +73 -0
  118. paddlex/inference/common/reader/ts_reader.py +46 -0
  119. paddlex/inference/common/reader/video_reader.py +42 -0
  120. paddlex/inference/common/result/__init__.py +29 -0
  121. paddlex/inference/common/result/base_cv_result.py +41 -0
  122. paddlex/inference/common/result/base_result.py +72 -0
  123. paddlex/inference/common/result/base_ts_result.py +41 -0
  124. paddlex/inference/common/result/base_video_result.py +36 -0
  125. paddlex/inference/common/result/mixin.py +702 -0
  126. paddlex/inference/models/__init__.py +55 -75
  127. paddlex/inference/models/anomaly_detection/__init__.py +15 -0
  128. paddlex/inference/models/anomaly_detection/predictor.py +135 -0
  129. paddlex/inference/models/anomaly_detection/processors.py +53 -0
  130. paddlex/inference/models/anomaly_detection/result.py +71 -0
  131. paddlex/inference/models/base/__init__.py +2 -3
  132. paddlex/inference/models/base/predictor/__init__.py +15 -0
  133. paddlex/inference/models/base/predictor/base_predictor.py +420 -0
  134. paddlex/inference/models/common/__init__.py +26 -0
  135. paddlex/inference/models/common/static_infer.py +850 -0
  136. paddlex/inference/models/common/tokenizer/__init__.py +19 -0
  137. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +655 -0
  138. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
  139. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +453 -0
  140. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +432 -0
  141. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +2149 -0
  142. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3720 -0
  143. paddlex/inference/models/common/tokenizer/utils.py +66 -0
  144. paddlex/inference/models/common/tokenizer/vocab.py +647 -0
  145. paddlex/inference/models/common/ts/__init__.py +15 -0
  146. paddlex/inference/models/common/ts/funcs.py +540 -0
  147. paddlex/inference/models/common/ts/processors.py +322 -0
  148. paddlex/inference/models/common/vision/__init__.py +23 -0
  149. paddlex/inference/models/common/vision/funcs.py +98 -0
  150. paddlex/inference/models/common/vision/processors.py +285 -0
  151. paddlex/inference/models/common/vlm/__init__.py +13 -0
  152. paddlex/inference/models/common/vlm/activations.py +189 -0
  153. paddlex/inference/models/common/vlm/bert_padding.py +127 -0
  154. paddlex/inference/models/common/vlm/distributed.py +229 -0
  155. paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
  156. paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
  157. paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
  158. paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
  159. paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
  160. paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
  161. paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
  162. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
  163. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
  164. paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
  165. paddlex/inference/models/common/vlm/transformers/model_utils.py +2038 -0
  166. paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
  167. paddlex/inference/models/common/vlm/utils.py +109 -0
  168. paddlex/inference/models/doc_vlm/__init__.py +15 -0
  169. paddlex/inference/models/doc_vlm/modeling/__init__.py +15 -0
  170. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2600 -0
  171. paddlex/inference/models/doc_vlm/predictor.py +198 -0
  172. paddlex/inference/models/doc_vlm/processors/__init__.py +15 -0
  173. paddlex/inference/models/doc_vlm/processors/common.py +372 -0
  174. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +698 -0
  175. paddlex/inference/models/doc_vlm/result.py +21 -0
  176. paddlex/inference/models/face_feature/__init__.py +15 -0
  177. paddlex/inference/models/face_feature/predictor.py +66 -0
  178. paddlex/inference/models/formula_recognition/__init__.py +15 -0
  179. paddlex/inference/models/formula_recognition/predictor.py +187 -0
  180. paddlex/inference/models/formula_recognition/processors.py +1002 -0
  181. paddlex/inference/models/formula_recognition/result.py +410 -0
  182. paddlex/inference/models/image_classification/__init__.py +15 -0
  183. paddlex/inference/models/image_classification/predictor.py +172 -0
  184. paddlex/inference/models/image_classification/processors.py +89 -0
  185. paddlex/inference/models/image_classification/result.py +93 -0
  186. paddlex/inference/models/image_feature/__init__.py +15 -0
  187. paddlex/inference/models/image_feature/predictor.py +146 -0
  188. paddlex/inference/models/image_feature/processors.py +32 -0
  189. paddlex/inference/models/image_feature/result.py +32 -0
  190. paddlex/inference/models/image_multilabel_classification/__init__.py +15 -0
  191. paddlex/inference/models/image_multilabel_classification/predictor.py +95 -0
  192. paddlex/inference/models/image_multilabel_classification/processors.py +89 -0
  193. paddlex/inference/models/image_multilabel_classification/result.py +96 -0
  194. paddlex/inference/models/image_unwarping/__init__.py +15 -0
  195. paddlex/inference/models/image_unwarping/predictor.py +97 -0
  196. paddlex/inference/models/image_unwarping/processors.py +92 -0
  197. paddlex/inference/models/image_unwarping/result.py +47 -0
  198. paddlex/inference/models/instance_segmentation/__init__.py +15 -0
  199. paddlex/inference/models/instance_segmentation/predictor.py +202 -0
  200. paddlex/inference/models/instance_segmentation/processors.py +102 -0
  201. paddlex/inference/models/instance_segmentation/result.py +162 -0
  202. paddlex/inference/models/keypoint_detection/__init__.py +15 -0
  203. paddlex/inference/models/keypoint_detection/predictor.py +187 -0
  204. paddlex/inference/models/keypoint_detection/processors.py +367 -0
  205. paddlex/inference/models/keypoint_detection/result.py +197 -0
  206. paddlex/inference/models/m_3d_bev_detection/__init__.py +15 -0
  207. paddlex/inference/models/m_3d_bev_detection/predictor.py +303 -0
  208. paddlex/inference/models/m_3d_bev_detection/processors.py +990 -0
  209. paddlex/inference/models/m_3d_bev_detection/result.py +68 -0
  210. paddlex/inference/models/m_3d_bev_detection/visualizer_3d.py +169 -0
  211. paddlex/inference/models/multilingual_speech_recognition/__init__.py +15 -0
  212. paddlex/inference/models/multilingual_speech_recognition/predictor.py +137 -0
  213. paddlex/inference/models/multilingual_speech_recognition/processors.py +1933 -0
  214. paddlex/inference/models/multilingual_speech_recognition/result.py +21 -0
  215. paddlex/inference/models/object_detection/__init__.py +15 -0
  216. paddlex/inference/models/object_detection/predictor.py +342 -0
  217. paddlex/inference/models/object_detection/processors.py +860 -0
  218. paddlex/inference/models/object_detection/result.py +114 -0
  219. paddlex/inference/models/object_detection/utils.py +68 -0
  220. paddlex/inference/models/open_vocabulary_detection/__init__.py +15 -0
  221. paddlex/inference/models/open_vocabulary_detection/predictor.py +172 -0
  222. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +16 -0
  223. paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
  224. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +496 -0
  225. paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
  226. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +15 -0
  227. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +113 -0
  228. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +15 -0
  229. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +249 -0
  230. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +15 -0
  231. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +149 -0
  232. paddlex/inference/models/semantic_segmentation/__init__.py +15 -0
  233. paddlex/inference/models/semantic_segmentation/predictor.py +158 -0
  234. paddlex/inference/models/semantic_segmentation/processors.py +117 -0
  235. paddlex/inference/models/semantic_segmentation/result.py +73 -0
  236. paddlex/inference/models/table_structure_recognition/__init__.py +15 -0
  237. paddlex/inference/models/table_structure_recognition/predictor.py +161 -0
  238. paddlex/inference/models/table_structure_recognition/processors.py +229 -0
  239. paddlex/inference/models/table_structure_recognition/result.py +73 -0
  240. paddlex/inference/models/text_detection/__init__.py +15 -0
  241. paddlex/inference/models/text_detection/predictor.py +183 -0
  242. paddlex/inference/models/text_detection/processors.py +504 -0
  243. paddlex/inference/models/text_detection/result.py +56 -0
  244. paddlex/inference/models/text_recognition/__init__.py +15 -0
  245. paddlex/inference/models/text_recognition/predictor.py +98 -0
  246. paddlex/inference/models/text_recognition/processors.py +245 -0
  247. paddlex/inference/models/text_recognition/result.py +76 -0
  248. paddlex/inference/models/ts_anomaly_detection/__init__.py +15 -0
  249. paddlex/inference/models/ts_anomaly_detection/predictor.py +141 -0
  250. paddlex/inference/models/ts_anomaly_detection/processors.py +98 -0
  251. paddlex/inference/models/ts_anomaly_detection/result.py +83 -0
  252. paddlex/inference/models/ts_classification/__init__.py +15 -0
  253. paddlex/inference/models/ts_classification/predictor.py +122 -0
  254. paddlex/inference/models/ts_classification/processors.py +122 -0
  255. paddlex/inference/models/ts_classification/result.py +87 -0
  256. paddlex/inference/models/ts_forecasting/__init__.py +15 -0
  257. paddlex/inference/models/ts_forecasting/predictor.py +154 -0
  258. paddlex/inference/models/ts_forecasting/processors.py +158 -0
  259. paddlex/inference/models/ts_forecasting/result.py +96 -0
  260. paddlex/inference/models/video_classification/__init__.py +15 -0
  261. paddlex/inference/models/video_classification/predictor.py +141 -0
  262. paddlex/inference/models/video_classification/processors.py +409 -0
  263. paddlex/inference/models/video_classification/result.py +96 -0
  264. paddlex/inference/models/video_detection/__init__.py +15 -0
  265. paddlex/inference/models/video_detection/predictor.py +129 -0
  266. paddlex/inference/models/video_detection/processors.py +463 -0
  267. paddlex/inference/models/video_detection/result.py +109 -0
  268. paddlex/inference/pipelines/__init__.py +186 -78
  269. paddlex/inference/pipelines/anomaly_detection/__init__.py +15 -0
  270. paddlex/inference/pipelines/anomaly_detection/pipeline.py +72 -0
  271. paddlex/inference/pipelines/attribute_recognition/__init__.py +15 -0
  272. paddlex/inference/pipelines/attribute_recognition/pipeline.py +110 -0
  273. paddlex/inference/pipelines/attribute_recognition/result.py +102 -0
  274. paddlex/inference/pipelines/base.py +125 -59
  275. paddlex/inference/pipelines/components/__init__.py +29 -0
  276. paddlex/inference/pipelines/components/chat_server/__init__.py +16 -0
  277. paddlex/inference/pipelines/components/chat_server/base.py +39 -0
  278. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +236 -0
  279. paddlex/inference/pipelines/components/common/__init__.py +19 -0
  280. paddlex/inference/pipelines/components/common/base_operator.py +37 -0
  281. paddlex/inference/pipelines/components/common/base_result.py +66 -0
  282. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +45 -0
  283. paddlex/inference/pipelines/components/common/crop_image_regions.py +556 -0
  284. paddlex/inference/pipelines/components/common/seal_det_warp.py +972 -0
  285. paddlex/inference/pipelines/components/common/sort_boxes.py +85 -0
  286. paddlex/inference/pipelines/components/common/warp_image.py +50 -0
  287. paddlex/inference/pipelines/components/faisser.py +357 -0
  288. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +16 -0
  289. paddlex/inference/pipelines/components/prompt_engineering/base.py +35 -0
  290. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +128 -0
  291. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +148 -0
  292. paddlex/inference/pipelines/components/retriever/__init__.py +16 -0
  293. paddlex/inference/pipelines/components/retriever/base.py +228 -0
  294. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +70 -0
  295. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +166 -0
  296. paddlex/inference/pipelines/components/utils/__init__.py +13 -0
  297. paddlex/inference/pipelines/components/utils/mixin.py +206 -0
  298. paddlex/inference/pipelines/doc_preprocessor/__init__.py +15 -0
  299. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +183 -0
  300. paddlex/inference/pipelines/doc_preprocessor/result.py +98 -0
  301. paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
  302. paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
  303. paddlex/inference/pipelines/face_recognition/__init__.py +15 -0
  304. paddlex/inference/pipelines/face_recognition/pipeline.py +63 -0
  305. paddlex/inference/pipelines/face_recognition/result.py +44 -0
  306. paddlex/inference/pipelines/formula_recognition/__init__.py +15 -0
  307. paddlex/inference/pipelines/formula_recognition/pipeline.py +309 -0
  308. paddlex/inference/pipelines/formula_recognition/result.py +292 -0
  309. paddlex/inference/pipelines/image_classification/__init__.py +15 -0
  310. paddlex/inference/pipelines/image_classification/pipeline.py +80 -0
  311. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +15 -0
  312. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +87 -0
  313. paddlex/inference/pipelines/instance_segmentation/__init__.py +15 -0
  314. paddlex/inference/pipelines/instance_segmentation/pipeline.py +81 -0
  315. paddlex/inference/pipelines/keypoint_detection/__init__.py +15 -0
  316. paddlex/inference/pipelines/keypoint_detection/pipeline.py +148 -0
  317. paddlex/inference/pipelines/layout_parsing/__init__.py +3 -2
  318. paddlex/inference/pipelines/layout_parsing/pipeline.py +581 -0
  319. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +749 -0
  320. paddlex/inference/pipelines/layout_parsing/result.py +204 -0
  321. paddlex/inference/pipelines/layout_parsing/result_v2.py +467 -0
  322. paddlex/inference/pipelines/layout_parsing/utils.py +2384 -0
  323. paddlex/inference/pipelines/m_3d_bev_detection/__init__.py +15 -0
  324. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +74 -0
  325. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +15 -0
  326. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +78 -0
  327. paddlex/inference/pipelines/object_detection/__init__.py +15 -0
  328. paddlex/inference/pipelines/object_detection/pipeline.py +105 -0
  329. paddlex/inference/pipelines/ocr/__init__.py +15 -0
  330. paddlex/inference/pipelines/ocr/pipeline.py +406 -0
  331. paddlex/inference/pipelines/ocr/result.py +252 -0
  332. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +15 -0
  333. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +86 -0
  334. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +15 -0
  335. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +100 -0
  336. paddlex/inference/pipelines/pp_chatocr/__init__.py +16 -0
  337. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +111 -0
  338. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +784 -0
  339. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +995 -0
  340. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +15 -0
  341. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +156 -0
  342. paddlex/inference/pipelines/pp_shitu_v2/result.py +126 -0
  343. paddlex/inference/pipelines/rotated_object_detection/__init__.py +15 -0
  344. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +85 -0
  345. paddlex/inference/pipelines/seal_recognition/__init__.py +15 -0
  346. paddlex/inference/pipelines/seal_recognition/pipeline.py +279 -0
  347. paddlex/inference/pipelines/seal_recognition/result.py +89 -0
  348. paddlex/inference/pipelines/semantic_segmentation/__init__.py +15 -0
  349. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +85 -0
  350. paddlex/inference/pipelines/small_object_detection/__init__.py +15 -0
  351. paddlex/inference/pipelines/small_object_detection/pipeline.py +85 -0
  352. paddlex/inference/pipelines/table_recognition/__init__.py +3 -2
  353. paddlex/inference/pipelines/table_recognition/pipeline.py +478 -0
  354. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +824 -0
  355. paddlex/inference/pipelines/table_recognition/result.py +218 -0
  356. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +366 -0
  357. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +484 -0
  358. paddlex/inference/pipelines/table_recognition/utils.py +24 -437
  359. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +15 -0
  360. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +72 -0
  361. paddlex/inference/pipelines/ts_classification/__init__.py +15 -0
  362. paddlex/inference/pipelines/ts_classification/pipeline.py +72 -0
  363. paddlex/inference/pipelines/ts_forecasting/__init__.py +15 -0
  364. paddlex/inference/pipelines/ts_forecasting/pipeline.py +72 -0
  365. paddlex/inference/pipelines/video_classification/__init__.py +15 -0
  366. paddlex/inference/pipelines/video_classification/pipeline.py +79 -0
  367. paddlex/inference/pipelines/video_detection/__init__.py +15 -0
  368. paddlex/inference/pipelines/video_detection/pipeline.py +86 -0
  369. paddlex/inference/serving/__init__.py +17 -0
  370. paddlex/inference/serving/basic_serving/__init__.py +18 -0
  371. paddlex/inference/serving/basic_serving/_app.py +221 -0
  372. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +44 -0
  373. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +13 -0
  374. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +100 -0
  375. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +36 -0
  376. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +95 -0
  377. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +67 -0
  378. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +100 -0
  379. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
  380. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +226 -0
  381. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +100 -0
  382. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +81 -0
  383. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +69 -0
  384. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +73 -0
  385. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +87 -0
  386. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +118 -0
  387. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +79 -0
  388. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +92 -0
  389. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +77 -0
  390. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +102 -0
  391. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +81 -0
  392. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +91 -0
  393. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +84 -0
  394. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +194 -0
  395. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +224 -0
  396. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +221 -0
  397. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +139 -0
  398. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +81 -0
  399. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +106 -0
  400. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +67 -0
  401. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +72 -0
  402. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +108 -0
  403. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +110 -0
  404. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +65 -0
  405. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +64 -0
  406. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +65 -0
  407. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +84 -0
  408. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +76 -0
  409. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +92 -0
  410. paddlex/inference/serving/basic_serving/_server.py +40 -0
  411. paddlex/inference/serving/infra/__init__.py +13 -0
  412. paddlex/inference/serving/infra/config.py +36 -0
  413. paddlex/inference/serving/infra/models.py +79 -0
  414. paddlex/inference/serving/infra/storage.py +180 -0
  415. paddlex/inference/serving/infra/utils.py +287 -0
  416. paddlex/inference/serving/schemas/__init__.py +13 -0
  417. paddlex/inference/serving/schemas/anomaly_detection.py +39 -0
  418. paddlex/inference/serving/schemas/doc_preprocessor.py +54 -0
  419. paddlex/inference/serving/schemas/doc_understanding.py +78 -0
  420. paddlex/inference/serving/schemas/face_recognition.py +124 -0
  421. paddlex/inference/serving/schemas/formula_recognition.py +56 -0
  422. paddlex/inference/serving/schemas/human_keypoint_detection.py +55 -0
  423. paddlex/inference/serving/schemas/image_classification.py +45 -0
  424. paddlex/inference/serving/schemas/image_multilabel_classification.py +47 -0
  425. paddlex/inference/serving/schemas/instance_segmentation.py +53 -0
  426. paddlex/inference/serving/schemas/layout_parsing.py +72 -0
  427. paddlex/inference/serving/schemas/m_3d_bev_detection.py +48 -0
  428. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +57 -0
  429. paddlex/inference/serving/schemas/object_detection.py +52 -0
  430. paddlex/inference/serving/schemas/ocr.py +60 -0
  431. paddlex/inference/serving/schemas/open_vocabulary_detection.py +52 -0
  432. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +52 -0
  433. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +61 -0
  434. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +134 -0
  435. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +151 -0
  436. paddlex/inference/serving/schemas/pp_shituv2.py +124 -0
  437. paddlex/inference/serving/schemas/pp_structurev3.py +84 -0
  438. paddlex/inference/serving/schemas/rotated_object_detection.py +52 -0
  439. paddlex/inference/serving/schemas/seal_recognition.py +62 -0
  440. paddlex/inference/serving/schemas/semantic_segmentation.py +45 -0
  441. paddlex/inference/serving/schemas/shared/__init__.py +13 -0
  442. paddlex/inference/serving/schemas/shared/classification.py +23 -0
  443. paddlex/inference/serving/schemas/shared/image_segmentation.py +28 -0
  444. paddlex/inference/serving/schemas/shared/object_detection.py +24 -0
  445. paddlex/inference/serving/schemas/shared/ocr.py +25 -0
  446. paddlex/inference/serving/schemas/small_object_detection.py +52 -0
  447. paddlex/inference/serving/schemas/table_recognition.py +64 -0
  448. paddlex/inference/serving/schemas/table_recognition_v2.py +66 -0
  449. paddlex/inference/serving/schemas/ts_anomaly_detection.py +37 -0
  450. paddlex/inference/serving/schemas/ts_classification.py +38 -0
  451. paddlex/inference/serving/schemas/ts_forecast.py +37 -0
  452. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +61 -0
  453. paddlex/inference/serving/schemas/video_classification.py +44 -0
  454. paddlex/inference/serving/schemas/video_detection.py +56 -0
  455. paddlex/inference/utils/__init__.py +1 -1
  456. paddlex/inference/utils/benchmark.py +333 -168
  457. paddlex/inference/utils/color_map.py +1 -1
  458. paddlex/inference/utils/get_pipeline_path.py +3 -2
  459. paddlex/inference/utils/hpi.py +251 -0
  460. paddlex/inference/utils/hpi_model_info_collection.json +2252 -0
  461. paddlex/inference/utils/io/__init__.py +11 -8
  462. paddlex/inference/utils/io/readers.py +178 -27
  463. paddlex/inference/utils/io/style.py +21 -14
  464. paddlex/inference/utils/io/tablepyxl.py +13 -5
  465. paddlex/inference/utils/io/writers.py +92 -10
  466. paddlex/inference/utils/model_paths.py +48 -0
  467. paddlex/inference/utils/new_ir_blocklist.py +27 -0
  468. paddlex/inference/utils/official_models.py +281 -213
  469. paddlex/inference/utils/pp_option.py +168 -77
  470. paddlex/inference/utils/trt_blocklist.py +43 -0
  471. paddlex/inference/utils/trt_config.py +420 -0
  472. paddlex/model.py +39 -14
  473. paddlex/modules/__init__.py +67 -57
  474. paddlex/modules/anomaly_detection/__init__.py +2 -2
  475. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +2 -3
  476. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  477. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  478. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +8 -4
  479. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +7 -4
  480. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +2 -2
  481. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  482. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +7 -2
  483. paddlex/modules/anomaly_detection/evaluator.py +1 -1
  484. paddlex/modules/anomaly_detection/exportor.py +1 -1
  485. paddlex/modules/anomaly_detection/model_list.py +1 -1
  486. paddlex/modules/anomaly_detection/trainer.py +3 -4
  487. paddlex/modules/base/__init__.py +5 -5
  488. paddlex/modules/base/build_model.py +2 -3
  489. paddlex/modules/base/dataset_checker/__init__.py +2 -2
  490. paddlex/modules/base/dataset_checker/dataset_checker.py +9 -4
  491. paddlex/modules/base/dataset_checker/utils.py +1 -3
  492. paddlex/modules/base/evaluator.py +24 -8
  493. paddlex/modules/base/exportor.py +36 -12
  494. paddlex/modules/base/trainer.py +43 -10
  495. paddlex/modules/base/utils/__init__.py +13 -0
  496. paddlex/modules/base/utils/cinn_setting.py +89 -0
  497. paddlex/modules/base/utils/coco_eval.py +94 -0
  498. paddlex/modules/base/utils/topk_eval.py +118 -0
  499. paddlex/modules/doc_vlm/__init__.py +18 -0
  500. paddlex/modules/doc_vlm/dataset_checker.py +29 -0
  501. paddlex/modules/doc_vlm/evaluator.py +29 -0
  502. paddlex/modules/doc_vlm/exportor.py +29 -0
  503. paddlex/modules/doc_vlm/model_list.py +16 -0
  504. paddlex/modules/doc_vlm/trainer.py +41 -0
  505. paddlex/modules/face_recognition/__init__.py +2 -2
  506. paddlex/modules/face_recognition/dataset_checker/__init__.py +2 -2
  507. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +1 -1
  508. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +3 -5
  509. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  510. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  511. paddlex/modules/face_recognition/evaluator.py +1 -1
  512. paddlex/modules/face_recognition/exportor.py +1 -1
  513. paddlex/modules/face_recognition/model_list.py +1 -1
  514. paddlex/modules/face_recognition/trainer.py +2 -24
  515. paddlex/modules/formula_recognition/__init__.py +6 -1
  516. paddlex/modules/formula_recognition/dataset_checker/__init__.py +113 -0
  517. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  518. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +158 -0
  519. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +76 -0
  520. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +95 -0
  521. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  522. paddlex/modules/formula_recognition/evaluator.py +77 -0
  523. paddlex/modules/formula_recognition/exportor.py +22 -0
  524. paddlex/modules/formula_recognition/model_list.py +4 -1
  525. paddlex/modules/formula_recognition/trainer.py +120 -0
  526. paddlex/modules/general_recognition/__init__.py +2 -2
  527. paddlex/modules/general_recognition/dataset_checker/__init__.py +2 -2
  528. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  529. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +7 -9
  530. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +4 -5
  531. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +6 -5
  532. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +1 -1
  533. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  534. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  535. paddlex/modules/general_recognition/evaluator.py +1 -1
  536. paddlex/modules/general_recognition/exportor.py +1 -1
  537. paddlex/modules/general_recognition/model_list.py +1 -1
  538. paddlex/modules/general_recognition/trainer.py +1 -1
  539. paddlex/modules/image_classification/__init__.py +2 -2
  540. paddlex/modules/image_classification/dataset_checker/__init__.py +2 -2
  541. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +2 -2
  542. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  543. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  544. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +4 -4
  545. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  546. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  547. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  548. paddlex/modules/image_classification/evaluator.py +1 -1
  549. paddlex/modules/image_classification/exportor.py +1 -1
  550. paddlex/modules/image_classification/model_list.py +3 -1
  551. paddlex/modules/image_classification/trainer.py +3 -3
  552. paddlex/modules/image_unwarping/__init__.py +1 -1
  553. paddlex/modules/image_unwarping/model_list.py +1 -1
  554. paddlex/modules/instance_segmentation/__init__.py +2 -2
  555. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +17 -3
  556. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  557. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +9 -5
  558. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +8 -5
  559. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +8 -8
  560. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +7 -4
  561. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  562. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +10 -8
  563. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  564. paddlex/modules/instance_segmentation/exportor.py +1 -1
  565. paddlex/modules/instance_segmentation/model_list.py +1 -1
  566. paddlex/modules/instance_segmentation/trainer.py +1 -1
  567. paddlex/modules/keypoint_detection/__init__.py +18 -0
  568. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +56 -0
  569. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +15 -0
  570. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  571. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  572. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +124 -0
  573. paddlex/modules/keypoint_detection/evaluator.py +41 -0
  574. paddlex/modules/keypoint_detection/exportor.py +22 -0
  575. paddlex/modules/keypoint_detection/model_list.py +16 -0
  576. paddlex/modules/keypoint_detection/trainer.py +39 -0
  577. paddlex/modules/m_3d_bev_detection/__init__.py +18 -0
  578. paddlex/modules/m_3d_bev_detection/dataset_checker/__init__.py +95 -0
  579. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/__init__.py +17 -0
  580. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/analyse_dataset.py +106 -0
  581. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/check_dataset.py +101 -0
  582. paddlex/modules/m_3d_bev_detection/evaluator.py +46 -0
  583. paddlex/modules/m_3d_bev_detection/exportor.py +22 -0
  584. paddlex/modules/m_3d_bev_detection/model_list.py +18 -0
  585. paddlex/modules/m_3d_bev_detection/trainer.py +68 -0
  586. paddlex/modules/multilabel_classification/__init__.py +2 -2
  587. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +2 -2
  588. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +2 -2
  589. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  590. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  591. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +10 -7
  592. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  593. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  594. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +1 -5
  595. paddlex/modules/multilabel_classification/evaluator.py +1 -1
  596. paddlex/modules/multilabel_classification/exportor.py +1 -1
  597. paddlex/modules/multilabel_classification/model_list.py +1 -1
  598. paddlex/modules/multilabel_classification/trainer.py +3 -3
  599. paddlex/modules/multilingual_speech_recognition/__init__.py +18 -0
  600. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +27 -0
  601. paddlex/modules/multilingual_speech_recognition/evaluator.py +27 -0
  602. paddlex/modules/multilingual_speech_recognition/exportor.py +27 -0
  603. paddlex/modules/multilingual_speech_recognition/model_list.py +22 -0
  604. paddlex/modules/multilingual_speech_recognition/trainer.py +42 -0
  605. paddlex/modules/object_detection/__init__.py +2 -2
  606. paddlex/modules/object_detection/dataset_checker/__init__.py +2 -11
  607. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +2 -2
  608. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +10 -8
  609. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
  610. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +13 -8
  611. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +8 -4
  612. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  613. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +9 -8
  614. paddlex/modules/object_detection/evaluator.py +18 -2
  615. paddlex/modules/object_detection/exportor.py +1 -1
  616. paddlex/modules/object_detection/model_list.py +11 -1
  617. paddlex/modules/object_detection/trainer.py +19 -6
  618. paddlex/modules/open_vocabulary_detection/__init__.py +18 -0
  619. paddlex/modules/open_vocabulary_detection/dataset_checker.py +29 -0
  620. paddlex/modules/open_vocabulary_detection/evaluator.py +29 -0
  621. paddlex/modules/open_vocabulary_detection/exportor.py +29 -0
  622. paddlex/modules/open_vocabulary_detection/model_list.py +16 -0
  623. paddlex/modules/open_vocabulary_detection/trainer.py +44 -0
  624. paddlex/modules/open_vocabulary_segmentation/__init__.py +18 -0
  625. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +29 -0
  626. paddlex/modules/open_vocabulary_segmentation/evaluator.py +29 -0
  627. paddlex/modules/open_vocabulary_segmentation/exportor.py +29 -0
  628. paddlex/modules/open_vocabulary_segmentation/model_list.py +19 -0
  629. paddlex/modules/open_vocabulary_segmentation/trainer.py +44 -0
  630. paddlex/modules/semantic_segmentation/__init__.py +2 -2
  631. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +17 -3
  632. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  633. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  634. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +2 -2
  635. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +7 -4
  636. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +2 -2
  637. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  638. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +6 -2
  639. paddlex/modules/semantic_segmentation/evaluator.py +1 -1
  640. paddlex/modules/semantic_segmentation/exportor.py +10 -1
  641. paddlex/modules/semantic_segmentation/model_list.py +3 -1
  642. paddlex/modules/semantic_segmentation/trainer.py +5 -4
  643. paddlex/modules/table_recognition/__init__.py +2 -2
  644. paddlex/modules/table_recognition/dataset_checker/__init__.py +21 -6
  645. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  646. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +3 -2
  647. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +20 -20
  648. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +2 -1
  649. paddlex/modules/table_recognition/evaluator.py +1 -1
  650. paddlex/modules/table_recognition/exportor.py +1 -1
  651. paddlex/modules/table_recognition/model_list.py +3 -1
  652. paddlex/modules/table_recognition/trainer.py +2 -5
  653. paddlex/modules/text_detection/__init__.py +2 -2
  654. paddlex/modules/text_detection/dataset_checker/__init__.py +20 -7
  655. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +2 -2
  656. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +12 -9
  657. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +15 -5
  658. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +3 -3
  659. paddlex/modules/text_detection/evaluator.py +1 -1
  660. paddlex/modules/text_detection/exportor.py +1 -1
  661. paddlex/modules/text_detection/model_list.py +3 -1
  662. paddlex/modules/text_detection/trainer.py +2 -5
  663. paddlex/modules/text_recognition/__init__.py +2 -2
  664. paddlex/modules/text_recognition/dataset_checker/__init__.py +20 -9
  665. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  666. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
  667. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +15 -8
  668. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
  669. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
  670. paddlex/modules/text_recognition/evaluator.py +5 -4
  671. paddlex/modules/text_recognition/exportor.py +1 -4
  672. paddlex/modules/text_recognition/model_list.py +15 -1
  673. paddlex/modules/text_recognition/trainer.py +6 -6
  674. paddlex/modules/ts_anomaly_detection/__init__.py +2 -2
  675. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +19 -5
  676. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  677. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  678. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +2 -2
  679. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +2 -6
  680. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +4 -4
  681. paddlex/modules/ts_anomaly_detection/evaluator.py +1 -1
  682. paddlex/modules/ts_anomaly_detection/exportor.py +2 -3
  683. paddlex/modules/ts_anomaly_detection/model_list.py +1 -1
  684. paddlex/modules/ts_anomaly_detection/trainer.py +22 -6
  685. paddlex/modules/ts_classification/__init__.py +2 -2
  686. paddlex/modules/ts_classification/dataset_checker/__init__.py +19 -5
  687. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +2 -2
  688. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -5
  689. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +2 -2
  690. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +2 -6
  691. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +4 -4
  692. paddlex/modules/ts_classification/evaluator.py +1 -1
  693. paddlex/modules/ts_classification/exportor.py +2 -3
  694. paddlex/modules/ts_classification/model_list.py +1 -1
  695. paddlex/modules/ts_classification/trainer.py +21 -5
  696. paddlex/modules/ts_forecast/__init__.py +2 -2
  697. paddlex/modules/ts_forecast/dataset_checker/__init__.py +19 -5
  698. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +2 -2
  699. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  700. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +2 -2
  701. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +2 -6
  702. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +4 -4
  703. paddlex/modules/ts_forecast/evaluator.py +1 -1
  704. paddlex/modules/ts_forecast/exportor.py +2 -3
  705. paddlex/modules/ts_forecast/model_list.py +1 -1
  706. paddlex/modules/ts_forecast/trainer.py +21 -5
  707. paddlex/modules/video_classification/__init__.py +18 -0
  708. paddlex/modules/video_classification/dataset_checker/__init__.py +93 -0
  709. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +18 -0
  710. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +93 -0
  711. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +120 -0
  712. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +82 -0
  713. paddlex/modules/video_classification/evaluator.py +44 -0
  714. paddlex/modules/video_classification/exportor.py +22 -0
  715. paddlex/modules/video_classification/model_list.py +19 -0
  716. paddlex/modules/video_classification/trainer.py +88 -0
  717. paddlex/modules/video_detection/__init__.py +18 -0
  718. paddlex/modules/video_detection/dataset_checker/__init__.py +86 -0
  719. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +17 -0
  720. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +100 -0
  721. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +132 -0
  722. paddlex/modules/video_detection/evaluator.py +42 -0
  723. paddlex/modules/video_detection/exportor.py +22 -0
  724. paddlex/modules/video_detection/model_list.py +15 -0
  725. paddlex/modules/video_detection/trainer.py +82 -0
  726. paddlex/ops/__init__.py +152 -0
  727. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +266 -0
  728. paddlex/ops/iou3d_nms/iou3d_cpu.h +28 -0
  729. paddlex/ops/iou3d_nms/iou3d_nms.cpp +206 -0
  730. paddlex/ops/iou3d_nms/iou3d_nms.h +35 -0
  731. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +114 -0
  732. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +484 -0
  733. paddlex/ops/setup.py +37 -0
  734. paddlex/ops/voxel/voxelize_op.cc +194 -0
  735. paddlex/ops/voxel/voxelize_op.cu +346 -0
  736. paddlex/paddlex_cli.py +352 -74
  737. paddlex/repo_apis/Paddle3D_api/__init__.py +17 -0
  738. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +18 -0
  739. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +118 -0
  740. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +238 -0
  741. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +55 -0
  742. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +104 -0
  743. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +145 -0
  744. paddlex/repo_apis/PaddleClas_api/__init__.py +1 -1
  745. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +3 -3
  746. paddlex/repo_apis/PaddleClas_api/cls/config.py +4 -3
  747. paddlex/repo_apis/PaddleClas_api/cls/model.py +9 -3
  748. paddlex/repo_apis/PaddleClas_api/cls/register.py +22 -5
  749. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -2
  750. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +2 -2
  751. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +2 -2
  752. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +1 -4
  753. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +2 -2
  754. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +1 -6
  755. paddlex/repo_apis/PaddleDetection_api/__init__.py +2 -2
  756. paddlex/repo_apis/PaddleDetection_api/config_helper.py +3 -3
  757. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +2 -2
  758. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +10 -7
  759. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +9 -3
  760. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +2 -3
  761. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -2
  762. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +3 -3
  763. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +31 -8
  764. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +11 -6
  765. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +82 -1
  766. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +184 -6
  767. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -2
  768. paddlex/repo_apis/PaddleNLP_api/__init__.py +1 -1
  769. paddlex/repo_apis/PaddleOCR_api/__init__.py +4 -2
  770. paddlex/repo_apis/PaddleOCR_api/config_utils.py +1 -1
  771. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +16 -0
  772. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +571 -0
  773. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +402 -0
  774. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +72 -0
  775. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +239 -0
  776. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +1 -1
  777. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +1 -1
  778. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +3 -3
  779. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +20 -3
  780. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +2 -2
  781. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +1 -1
  782. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +1 -1
  783. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +3 -3
  784. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +20 -3
  785. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +2 -2
  786. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +1 -1
  787. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +25 -3
  788. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +10 -4
  789. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +128 -10
  790. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -2
  791. paddlex/repo_apis/PaddleSeg_api/__init__.py +1 -1
  792. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +2 -2
  793. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +1 -1
  794. paddlex/repo_apis/PaddleSeg_api/seg/config.py +12 -6
  795. paddlex/repo_apis/PaddleSeg_api/seg/model.py +15 -5
  796. paddlex/repo_apis/PaddleSeg_api/seg/register.py +22 -3
  797. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -2
  798. paddlex/repo_apis/PaddleTS_api/__init__.py +4 -3
  799. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +1 -1
  800. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +2 -3
  801. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +2 -2
  802. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +2 -2
  803. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +1 -1
  804. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +25 -3
  805. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +15 -11
  806. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +2 -2
  807. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +1 -1
  808. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -3
  809. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +2 -2
  810. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +2 -2
  811. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +1 -1
  812. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +2 -3
  813. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +1 -1
  814. paddlex/repo_apis/PaddleVideo_api/__init__.py +17 -0
  815. paddlex/repo_apis/PaddleVideo_api/config_utils.py +51 -0
  816. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +19 -0
  817. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +548 -0
  818. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +346 -0
  819. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +70 -0
  820. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +204 -0
  821. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +19 -0
  822. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +549 -0
  823. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +298 -0
  824. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +44 -0
  825. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +199 -0
  826. paddlex/repo_apis/__init__.py +1 -1
  827. paddlex/repo_apis/base/__init__.py +4 -5
  828. paddlex/repo_apis/base/config.py +2 -3
  829. paddlex/repo_apis/base/model.py +11 -19
  830. paddlex/repo_apis/base/register.py +1 -1
  831. paddlex/repo_apis/base/runner.py +13 -13
  832. paddlex/repo_apis/base/utils/__init__.py +1 -1
  833. paddlex/repo_apis/base/utils/arg.py +1 -1
  834. paddlex/repo_apis/base/utils/subprocess.py +1 -1
  835. paddlex/repo_manager/__init__.py +2 -9
  836. paddlex/repo_manager/core.py +9 -27
  837. paddlex/repo_manager/meta.py +58 -25
  838. paddlex/repo_manager/repo.py +169 -141
  839. paddlex/repo_manager/utils.py +72 -222
  840. paddlex/utils/__init__.py +1 -1
  841. paddlex/utils/cache.py +8 -10
  842. paddlex/utils/config.py +10 -8
  843. paddlex/utils/custom_device_list.py +287 -0
  844. paddlex/utils/deps.py +249 -0
  845. paddlex/utils/device.py +125 -33
  846. paddlex/utils/download.py +4 -4
  847. paddlex/utils/env.py +54 -0
  848. paddlex/utils/errors/__init__.py +1 -1
  849. paddlex/utils/errors/dataset_checker.py +1 -1
  850. paddlex/utils/errors/others.py +2 -16
  851. paddlex/utils/file_interface.py +4 -5
  852. paddlex/utils/flags.py +22 -11
  853. paddlex/utils/fonts/__init__.py +50 -6
  854. paddlex/utils/func_register.py +1 -1
  855. paddlex/utils/install.py +87 -0
  856. paddlex/utils/interactive_get_pipeline.py +3 -3
  857. paddlex/utils/lazy_loader.py +4 -2
  858. paddlex/utils/logging.py +11 -3
  859. paddlex/utils/misc.py +5 -5
  860. paddlex/utils/pipeline_arguments.py +719 -0
  861. paddlex/utils/result_saver.py +4 -5
  862. paddlex/utils/subclass_register.py +2 -4
  863. paddlex/version.py +2 -1
  864. paddlex-3.0.0rc1.dist-info/METADATA +1174 -0
  865. paddlex-3.0.0rc1.dist-info/RECORD +1068 -0
  866. paddlex-3.0.0rc1.dist-info/WHEEL +5 -0
  867. paddlex/configs/face_recognition/MobileFaceNet.yaml +0 -44
  868. paddlex/configs/face_recognition/ResNet50_face.yaml +0 -44
  869. paddlex/configs/formula_recognition/LaTeX_OCR_rec.yaml +0 -40
  870. paddlex/configs/image_classification/CLIP_vit_base_patch16_224.yaml +0 -41
  871. paddlex/configs/image_classification/CLIP_vit_large_patch14_224.yaml +0 -41
  872. paddlex/configs/image_classification/ConvNeXt_large_384.yaml +0 -41
  873. paddlex/configs/object_detection/YOLOX-X.yaml +0 -40
  874. paddlex/configs/semantic_segmentation/SeaFormer_base.yaml +0 -40
  875. paddlex/configs/semantic_segmentation/SeaFormer_large.yaml +0 -40
  876. paddlex/configs/semantic_segmentation/SeaFormer_small.yaml +0 -40
  877. paddlex/configs/semantic_segmentation/SeaFormer_tiny.yaml +0 -40
  878. paddlex/inference/components/__init__.py +0 -18
  879. paddlex/inference/components/base.py +0 -292
  880. paddlex/inference/components/llm/__init__.py +0 -25
  881. paddlex/inference/components/llm/base.py +0 -65
  882. paddlex/inference/components/llm/erniebot.py +0 -212
  883. paddlex/inference/components/paddle_predictor/__init__.py +0 -20
  884. paddlex/inference/components/paddle_predictor/predictor.py +0 -332
  885. paddlex/inference/components/retrieval/__init__.py +0 -15
  886. paddlex/inference/components/retrieval/faiss.py +0 -359
  887. paddlex/inference/components/task_related/__init__.py +0 -33
  888. paddlex/inference/components/task_related/clas.py +0 -124
  889. paddlex/inference/components/task_related/det.py +0 -284
  890. paddlex/inference/components/task_related/instance_seg.py +0 -89
  891. paddlex/inference/components/task_related/seal_det_warp.py +0 -940
  892. paddlex/inference/components/task_related/seg.py +0 -40
  893. paddlex/inference/components/task_related/table_rec.py +0 -191
  894. paddlex/inference/components/task_related/text_det.py +0 -895
  895. paddlex/inference/components/task_related/text_rec.py +0 -353
  896. paddlex/inference/components/task_related/warp.py +0 -43
  897. paddlex/inference/components/transforms/__init__.py +0 -16
  898. paddlex/inference/components/transforms/image/__init__.py +0 -15
  899. paddlex/inference/components/transforms/image/common.py +0 -598
  900. paddlex/inference/components/transforms/image/funcs.py +0 -58
  901. paddlex/inference/components/transforms/read_data.py +0 -67
  902. paddlex/inference/components/transforms/ts/__init__.py +0 -15
  903. paddlex/inference/components/transforms/ts/common.py +0 -393
  904. paddlex/inference/components/transforms/ts/funcs.py +0 -424
  905. paddlex/inference/models/anomaly_detection.py +0 -87
  906. paddlex/inference/models/base/base_predictor.py +0 -76
  907. paddlex/inference/models/base/basic_predictor.py +0 -122
  908. paddlex/inference/models/face_recognition.py +0 -21
  909. paddlex/inference/models/formula_recognition.py +0 -55
  910. paddlex/inference/models/general_recognition.py +0 -99
  911. paddlex/inference/models/image_classification.py +0 -101
  912. paddlex/inference/models/image_unwarping.py +0 -43
  913. paddlex/inference/models/instance_segmentation.py +0 -66
  914. paddlex/inference/models/multilabel_classification.py +0 -33
  915. paddlex/inference/models/object_detection.py +0 -129
  916. paddlex/inference/models/semantic_segmentation.py +0 -86
  917. paddlex/inference/models/table_recognition.py +0 -106
  918. paddlex/inference/models/text_detection.py +0 -105
  919. paddlex/inference/models/text_recognition.py +0 -78
  920. paddlex/inference/models/ts_ad.py +0 -68
  921. paddlex/inference/models/ts_cls.py +0 -57
  922. paddlex/inference/models/ts_fc.py +0 -73
  923. paddlex/inference/pipelines/attribute_recognition.py +0 -92
  924. paddlex/inference/pipelines/face_recognition.py +0 -49
  925. paddlex/inference/pipelines/formula_recognition.py +0 -102
  926. paddlex/inference/pipelines/layout_parsing/layout_parsing.py +0 -362
  927. paddlex/inference/pipelines/ocr.py +0 -80
  928. paddlex/inference/pipelines/pp_shitu_v2.py +0 -152
  929. paddlex/inference/pipelines/ppchatocrv3/__init__.py +0 -15
  930. paddlex/inference/pipelines/ppchatocrv3/ch_prompt.yaml +0 -14
  931. paddlex/inference/pipelines/ppchatocrv3/ppchatocrv3.py +0 -717
  932. paddlex/inference/pipelines/ppchatocrv3/utils.py +0 -168
  933. paddlex/inference/pipelines/seal_recognition.py +0 -152
  934. paddlex/inference/pipelines/serving/__init__.py +0 -17
  935. paddlex/inference/pipelines/serving/_pipeline_apps/__init__.py +0 -205
  936. paddlex/inference/pipelines/serving/_pipeline_apps/anomaly_detection.py +0 -80
  937. paddlex/inference/pipelines/serving/_pipeline_apps/face_recognition.py +0 -317
  938. paddlex/inference/pipelines/serving/_pipeline_apps/formula_recognition.py +0 -119
  939. paddlex/inference/pipelines/serving/_pipeline_apps/image_classification.py +0 -101
  940. paddlex/inference/pipelines/serving/_pipeline_apps/instance_segmentation.py +0 -112
  941. paddlex/inference/pipelines/serving/_pipeline_apps/layout_parsing.py +0 -205
  942. paddlex/inference/pipelines/serving/_pipeline_apps/multi_label_image_classification.py +0 -90
  943. paddlex/inference/pipelines/serving/_pipeline_apps/object_detection.py +0 -90
  944. paddlex/inference/pipelines/serving/_pipeline_apps/ocr.py +0 -98
  945. paddlex/inference/pipelines/serving/_pipeline_apps/pedestrian_attribute_recognition.py +0 -102
  946. paddlex/inference/pipelines/serving/_pipeline_apps/pp_shitu_v2.py +0 -319
  947. paddlex/inference/pipelines/serving/_pipeline_apps/ppchatocrv3.py +0 -445
  948. paddlex/inference/pipelines/serving/_pipeline_apps/seal_recognition.py +0 -110
  949. paddlex/inference/pipelines/serving/_pipeline_apps/semantic_segmentation.py +0 -82
  950. paddlex/inference/pipelines/serving/_pipeline_apps/small_object_detection.py +0 -92
  951. paddlex/inference/pipelines/serving/_pipeline_apps/table_recognition.py +0 -110
  952. paddlex/inference/pipelines/serving/_pipeline_apps/ts_ad.py +0 -68
  953. paddlex/inference/pipelines/serving/_pipeline_apps/ts_cls.py +0 -68
  954. paddlex/inference/pipelines/serving/_pipeline_apps/ts_fc.py +0 -68
  955. paddlex/inference/pipelines/serving/_pipeline_apps/vehicle_attribute_recognition.py +0 -102
  956. paddlex/inference/pipelines/serving/app.py +0 -164
  957. paddlex/inference/pipelines/serving/models.py +0 -30
  958. paddlex/inference/pipelines/serving/server.py +0 -25
  959. paddlex/inference/pipelines/serving/storage.py +0 -161
  960. paddlex/inference/pipelines/serving/utils.py +0 -190
  961. paddlex/inference/pipelines/single_model_pipeline.py +0 -76
  962. paddlex/inference/pipelines/table_recognition/table_recognition.py +0 -193
  963. paddlex/inference/results/__init__.py +0 -31
  964. paddlex/inference/results/attribute_rec.py +0 -89
  965. paddlex/inference/results/base.py +0 -43
  966. paddlex/inference/results/chat_ocr.py +0 -158
  967. paddlex/inference/results/clas.py +0 -133
  968. paddlex/inference/results/det.py +0 -86
  969. paddlex/inference/results/face_rec.py +0 -34
  970. paddlex/inference/results/formula_rec.py +0 -363
  971. paddlex/inference/results/instance_seg.py +0 -152
  972. paddlex/inference/results/ocr.py +0 -157
  973. paddlex/inference/results/seal_rec.py +0 -50
  974. paddlex/inference/results/seg.py +0 -72
  975. paddlex/inference/results/shitu.py +0 -35
  976. paddlex/inference/results/table_rec.py +0 -109
  977. paddlex/inference/results/text_det.py +0 -33
  978. paddlex/inference/results/text_rec.py +0 -66
  979. paddlex/inference/results/ts.py +0 -37
  980. paddlex/inference/results/utils/__init__.py +0 -13
  981. paddlex/inference/results/utils/mixin.py +0 -204
  982. paddlex/inference/results/warp.py +0 -31
  983. paddlex/inference/utils/new_ir_blacklist.py +0 -22
  984. paddlex/inference/utils/process_hook.py +0 -54
  985. paddlex/pipelines/OCR.yaml +0 -8
  986. paddlex/pipelines/PP-ChatOCRv3-doc.yaml +0 -27
  987. paddlex/pipelines/PP-ShiTuV2.yaml +0 -13
  988. paddlex/pipelines/anomaly_detection.yaml +0 -7
  989. paddlex/pipelines/face_recognition.yaml +0 -13
  990. paddlex/pipelines/formula_recognition.yaml +0 -8
  991. paddlex/pipelines/image_classification.yaml +0 -7
  992. paddlex/pipelines/instance_segmentation.yaml +0 -7
  993. paddlex/pipelines/layout_parsing.yaml +0 -14
  994. paddlex/pipelines/multi_label_image_classification.yaml +0 -7
  995. paddlex/pipelines/object_detection.yaml +0 -7
  996. paddlex/pipelines/pedestrian_attribute_recognition.yaml +0 -7
  997. paddlex/pipelines/seal_recognition.yaml +0 -10
  998. paddlex/pipelines/semantic_segmentation.yaml +0 -7
  999. paddlex/pipelines/small_object_detection.yaml +0 -7
  1000. paddlex/pipelines/table_recognition.yaml +0 -12
  1001. paddlex/pipelines/ts_ad.yaml +0 -7
  1002. paddlex/pipelines/ts_cls.yaml +0 -7
  1003. paddlex/pipelines/ts_fc.yaml +0 -7
  1004. paddlex/pipelines/vehicle_attribute_recognition.yaml +0 -7
  1005. paddlex/repo_manager/requirements.txt +0 -18
  1006. paddlex/utils/fonts/PingFang-SC-Regular.ttf +0 -0
  1007. paddlex-3.0.0b2.dist-info/METADATA +0 -760
  1008. paddlex-3.0.0b2.dist-info/RECORD +0 -646
  1009. paddlex-3.0.0b2.dist-info/WHEEL +0 -5
  1010. /paddlex/configs/{doc_text_orientation → modules/doc_text_orientation}/PP-LCNet_x1_0_doc_ori.yaml +0 -0
  1011. /paddlex/configs/{face_detection → modules/face_detection}/BlazeFace-FPN-SSH.yaml +0 -0
  1012. /paddlex/configs/{face_detection → modules/face_detection}/BlazeFace.yaml +0 -0
  1013. /paddlex/configs/{face_detection → modules/face_detection}/PP-YOLOE_plus-S_face.yaml +0 -0
  1014. /paddlex/configs/{face_detection → modules/face_detection}/PicoDet_LCNet_x2_5_face.yaml +0 -0
  1015. /paddlex/configs/{human_detection → modules/human_detection}/PP-YOLOE-L_human.yaml +0 -0
  1016. /paddlex/configs/{human_detection → modules/human_detection}/PP-YOLOE-S_human.yaml +0 -0
  1017. /paddlex/configs/{anomaly_detection → modules/image_anomaly_detection}/STFPM.yaml +0 -0
  1018. /paddlex/configs/{image_classification → modules/image_classification}/ConvNeXt_base_224.yaml +0 -0
  1019. /paddlex/configs/{image_classification → modules/image_classification}/ConvNeXt_base_384.yaml +0 -0
  1020. /paddlex/configs/{image_classification → modules/image_classification}/ConvNeXt_large_224.yaml +0 -0
  1021. /paddlex/configs/{image_classification → modules/image_classification}/ConvNeXt_small.yaml +0 -0
  1022. /paddlex/configs/{image_classification → modules/image_classification}/ConvNeXt_tiny.yaml +0 -0
  1023. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-L.yaml +0 -0
  1024. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-M.yaml +0 -0
  1025. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-S.yaml +0 -0
  1026. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-T0.yaml +0 -0
  1027. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-T1.yaml +0 -0
  1028. /paddlex/configs/{image_classification → modules/image_classification}/FasterNet-T2.yaml +0 -0
  1029. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV1_x0_25.yaml +0 -0
  1030. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV1_x0_5.yaml +0 -0
  1031. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV1_x0_75.yaml +0 -0
  1032. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV1_x1_0.yaml +0 -0
  1033. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV2_x0_25.yaml +0 -0
  1034. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV2_x0_5.yaml +0 -0
  1035. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV2_x1_0.yaml +0 -0
  1036. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV2_x1_5.yaml +0 -0
  1037. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV2_x2_0.yaml +0 -0
  1038. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_large_x0_35.yaml +0 -0
  1039. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_large_x0_5.yaml +0 -0
  1040. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_large_x0_75.yaml +0 -0
  1041. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_large_x1_0.yaml +0 -0
  1042. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_large_x1_25.yaml +0 -0
  1043. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_small_x0_35.yaml +0 -0
  1044. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_small_x0_5.yaml +0 -0
  1045. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_small_x0_75.yaml +0 -0
  1046. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_small_x1_0.yaml +0 -0
  1047. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV3_small_x1_25.yaml +0 -0
  1048. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV4_conv_large.yaml +0 -0
  1049. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV4_conv_medium.yaml +0 -0
  1050. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV4_conv_small.yaml +0 -0
  1051. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV4_hybrid_large.yaml +0 -0
  1052. /paddlex/configs/{image_classification → modules/image_classification}/MobileNetV4_hybrid_medium.yaml +0 -0
  1053. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B0.yaml +0 -0
  1054. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B1.yaml +0 -0
  1055. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B2.yaml +0 -0
  1056. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B3.yaml +0 -0
  1057. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B4.yaml +0 -0
  1058. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B5.yaml +0 -0
  1059. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNetV2-B6.yaml +0 -0
  1060. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNet_base.yaml +0 -0
  1061. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNet_small.yaml +0 -0
  1062. /paddlex/configs/{image_classification → modules/image_classification}/PP-HGNet_tiny.yaml +0 -0
  1063. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNetV2_base.yaml +0 -0
  1064. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNetV2_large.yaml +0 -0
  1065. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNetV2_small.yaml +0 -0
  1066. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x0_25.yaml +0 -0
  1067. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x0_35.yaml +0 -0
  1068. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x0_5.yaml +0 -0
  1069. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x0_75.yaml +0 -0
  1070. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x1_0.yaml +0 -0
  1071. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x1_5.yaml +0 -0
  1072. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x2_0.yaml +0 -0
  1073. /paddlex/configs/{image_classification → modules/image_classification}/PP-LCNet_x2_5.yaml +0 -0
  1074. /paddlex/configs/{image_classification → modules/image_classification}/ResNet101.yaml +0 -0
  1075. /paddlex/configs/{image_classification → modules/image_classification}/ResNet101_vd.yaml +0 -0
  1076. /paddlex/configs/{image_classification → modules/image_classification}/ResNet152.yaml +0 -0
  1077. /paddlex/configs/{image_classification → modules/image_classification}/ResNet152_vd.yaml +0 -0
  1078. /paddlex/configs/{image_classification → modules/image_classification}/ResNet18.yaml +0 -0
  1079. /paddlex/configs/{image_classification → modules/image_classification}/ResNet18_vd.yaml +0 -0
  1080. /paddlex/configs/{image_classification → modules/image_classification}/ResNet200_vd.yaml +0 -0
  1081. /paddlex/configs/{image_classification → modules/image_classification}/ResNet34.yaml +0 -0
  1082. /paddlex/configs/{image_classification → modules/image_classification}/ResNet34_vd.yaml +0 -0
  1083. /paddlex/configs/{image_classification → modules/image_classification}/ResNet50.yaml +0 -0
  1084. /paddlex/configs/{image_classification → modules/image_classification}/ResNet50_vd.yaml +0 -0
  1085. /paddlex/configs/{image_classification → modules/image_classification}/StarNet-S1.yaml +0 -0
  1086. /paddlex/configs/{image_classification → modules/image_classification}/StarNet-S2.yaml +0 -0
  1087. /paddlex/configs/{image_classification → modules/image_classification}/StarNet-S3.yaml +0 -0
  1088. /paddlex/configs/{image_classification → modules/image_classification}/StarNet-S4.yaml +0 -0
  1089. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_base_patch4_window12_384.yaml +0 -0
  1090. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_base_patch4_window7_224.yaml +0 -0
  1091. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_large_patch4_window12_384.yaml +0 -0
  1092. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_large_patch4_window7_224.yaml +0 -0
  1093. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_small_patch4_window7_224.yaml +0 -0
  1094. /paddlex/configs/{image_classification → modules/image_classification}/SwinTransformer_tiny_patch4_window7_224.yaml +0 -0
  1095. /paddlex/configs/{general_recognition → modules/image_feature}/PP-ShiTuV2_rec.yaml +0 -0
  1096. /paddlex/configs/{general_recognition → modules/image_feature}/PP-ShiTuV2_rec_CLIP_vit_base.yaml +0 -0
  1097. /paddlex/configs/{general_recognition → modules/image_feature}/PP-ShiTuV2_rec_CLIP_vit_large.yaml +0 -0
  1098. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/CLIP_vit_base_patch16_448_ML.yaml +0 -0
  1099. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/PP-HGNetV2-B0_ML.yaml +0 -0
  1100. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/PP-HGNetV2-B4_ML.yaml +0 -0
  1101. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/PP-HGNetV2-B6_ML.yaml +0 -0
  1102. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/PP-LCNet_x1_0_ML.yaml +0 -0
  1103. /paddlex/configs/{multilabel_classification → modules/image_multilabel_classification}/ResNet50_ML.yaml +0 -0
  1104. /paddlex/configs/{image_unwarping → modules/image_unwarping}/UVDoc.yaml +0 -0
  1105. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Cascade-MaskRCNN-ResNet50-FPN.yaml +0 -0
  1106. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Cascade-MaskRCNN-ResNet50-vd-SSLDv2-FPN.yaml +0 -0
  1107. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Mask-RT-DETR-H.yaml +0 -0
  1108. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Mask-RT-DETR-L.yaml +0 -0
  1109. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Mask-RT-DETR-M.yaml +0 -0
  1110. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Mask-RT-DETR-S.yaml +0 -0
  1111. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/Mask-RT-DETR-X.yaml +0 -0
  1112. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNeXt101-vd-FPN.yaml +0 -0
  1113. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNet101-FPN.yaml +0 -0
  1114. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNet101-vd-FPN.yaml +0 -0
  1115. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNet50-FPN.yaml +0 -0
  1116. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNet50-vd-FPN.yaml +0 -0
  1117. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/MaskRCNN-ResNet50.yaml +0 -0
  1118. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/PP-YOLOE_seg-S.yaml +0 -0
  1119. /paddlex/configs/{instance_segmentation → modules/instance_segmentation}/SOLOv2.yaml +0 -0
  1120. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet-L_layout_17cls.yaml +0 -0
  1121. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet-L_layout_3cls.yaml +0 -0
  1122. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet-S_layout_17cls.yaml +0 -0
  1123. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet-S_layout_3cls.yaml +0 -0
  1124. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet_layout_1x.yaml +0 -0
  1125. /paddlex/configs/{structure_analysis → modules/layout_detection}/PicoDet_layout_1x_table.yaml +0 -0
  1126. /paddlex/configs/{structure_analysis → modules/layout_detection}/RT-DETR-H_layout_17cls.yaml +0 -0
  1127. /paddlex/configs/{structure_analysis → modules/layout_detection}/RT-DETR-H_layout_3cls.yaml +0 -0
  1128. /paddlex/configs/{mainbody_detection → modules/mainbody_detection}/PP-ShiTuV2_det.yaml +0 -0
  1129. /paddlex/configs/{object_detection → modules/object_detection}/Cascade-FasterRCNN-ResNet50-FPN.yaml +0 -0
  1130. /paddlex/configs/{object_detection → modules/object_detection}/Cascade-FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +0 -0
  1131. /paddlex/configs/{object_detection → modules/object_detection}/CenterNet-DLA-34.yaml +0 -0
  1132. /paddlex/configs/{object_detection → modules/object_detection}/CenterNet-ResNet50.yaml +0 -0
  1133. /paddlex/configs/{object_detection → modules/object_detection}/DETR-R50.yaml +0 -0
  1134. /paddlex/configs/{object_detection → modules/object_detection}/FCOS-ResNet50.yaml +0 -0
  1135. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNeXt101-vd-FPN.yaml +0 -0
  1136. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet101-FPN.yaml +0 -0
  1137. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet101.yaml +0 -0
  1138. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet34-FPN.yaml +0 -0
  1139. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet50-FPN.yaml +0 -0
  1140. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet50-vd-FPN.yaml +0 -0
  1141. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +0 -0
  1142. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-ResNet50.yaml +0 -0
  1143. /paddlex/configs/{object_detection → modules/object_detection}/FasterRCNN-Swin-Tiny-FPN.yaml +0 -0
  1144. /paddlex/configs/{object_detection → modules/object_detection}/PP-YOLOE_plus-L.yaml +0 -0
  1145. /paddlex/configs/{object_detection → modules/object_detection}/PP-YOLOE_plus-M.yaml +0 -0
  1146. /paddlex/configs/{object_detection → modules/object_detection}/PP-YOLOE_plus-S.yaml +0 -0
  1147. /paddlex/configs/{object_detection → modules/object_detection}/PP-YOLOE_plus-X.yaml +0 -0
  1148. /paddlex/configs/{object_detection → modules/object_detection}/PicoDet-L.yaml +0 -0
  1149. /paddlex/configs/{object_detection → modules/object_detection}/PicoDet-M.yaml +0 -0
  1150. /paddlex/configs/{object_detection → modules/object_detection}/PicoDet-S.yaml +0 -0
  1151. /paddlex/configs/{object_detection → modules/object_detection}/PicoDet-XS.yaml +0 -0
  1152. /paddlex/configs/{object_detection → modules/object_detection}/RT-DETR-H.yaml +0 -0
  1153. /paddlex/configs/{object_detection → modules/object_detection}/RT-DETR-L.yaml +0 -0
  1154. /paddlex/configs/{object_detection → modules/object_detection}/RT-DETR-R18.yaml +0 -0
  1155. /paddlex/configs/{object_detection → modules/object_detection}/RT-DETR-R50.yaml +0 -0
  1156. /paddlex/configs/{object_detection → modules/object_detection}/RT-DETR-X.yaml +0 -0
  1157. /paddlex/configs/{object_detection → modules/object_detection}/YOLOX-L.yaml +0 -0
  1158. /paddlex/configs/{object_detection → modules/object_detection}/YOLOX-M.yaml +0 -0
  1159. /paddlex/configs/{object_detection → modules/object_detection}/YOLOX-N.yaml +0 -0
  1160. /paddlex/configs/{object_detection → modules/object_detection}/YOLOX-S.yaml +0 -0
  1161. /paddlex/configs/{object_detection → modules/object_detection}/YOLOX-T.yaml +0 -0
  1162. /paddlex/configs/{object_detection → modules/object_detection}/YOLOv3-DarkNet53.yaml +0 -0
  1163. /paddlex/configs/{object_detection → modules/object_detection}/YOLOv3-MobileNetV3.yaml +0 -0
  1164. /paddlex/configs/{object_detection → modules/object_detection}/YOLOv3-ResNet50_vd_DCN.yaml +0 -0
  1165. /paddlex/configs/{pedestrian_attribute → modules/pedestrian_attribute_recognition}/PP-LCNet_x1_0_pedestrian_attribute.yaml +0 -0
  1166. /paddlex/configs/{text_detection_seal → modules/seal_text_detection}/PP-OCRv4_mobile_seal_det.yaml +0 -0
  1167. /paddlex/configs/{text_detection_seal → modules/seal_text_detection}/PP-OCRv4_server_seal_det.yaml +0 -0
  1168. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/Deeplabv3-R101.yaml +0 -0
  1169. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/Deeplabv3-R50.yaml +0 -0
  1170. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/Deeplabv3_Plus-R101.yaml +0 -0
  1171. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/Deeplabv3_Plus-R50.yaml +0 -0
  1172. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/OCRNet_HRNet-W18.yaml +0 -0
  1173. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/OCRNet_HRNet-W48.yaml +0 -0
  1174. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/PP-LiteSeg-B.yaml +0 -0
  1175. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/PP-LiteSeg-T.yaml +0 -0
  1176. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B0.yaml +0 -0
  1177. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B1.yaml +0 -0
  1178. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B2.yaml +0 -0
  1179. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B3.yaml +0 -0
  1180. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B4.yaml +0 -0
  1181. /paddlex/configs/{semantic_segmentation → modules/semantic_segmentation}/SegFormer-B5.yaml +0 -0
  1182. /paddlex/configs/{small_object_detection → modules/small_object_detection}/PP-YOLOE_plus_SOD-L.yaml +0 -0
  1183. /paddlex/configs/{small_object_detection → modules/small_object_detection}/PP-YOLOE_plus_SOD-S.yaml +0 -0
  1184. /paddlex/configs/{small_object_detection → modules/small_object_detection}/PP-YOLOE_plus_SOD-largesize-L.yaml +0 -0
  1185. /paddlex/configs/{table_recognition → modules/table_structure_recognition}/SLANet.yaml +0 -0
  1186. /paddlex/configs/{table_recognition → modules/table_structure_recognition}/SLANet_plus.yaml +0 -0
  1187. /paddlex/configs/{text_detection → modules/text_detection}/PP-OCRv4_mobile_det.yaml +0 -0
  1188. /paddlex/configs/{text_detection → modules/text_detection}/PP-OCRv4_server_det.yaml +0 -0
  1189. /paddlex/configs/{text_recognition → modules/text_recognition}/PP-OCRv4_mobile_rec.yaml +0 -0
  1190. /paddlex/configs/{text_recognition → modules/text_recognition}/PP-OCRv4_server_rec.yaml +0 -0
  1191. /paddlex/configs/{text_recognition → modules/text_recognition}/ch_RepSVTR_rec.yaml +0 -0
  1192. /paddlex/configs/{text_recognition → modules/text_recognition}/ch_SVTRv2_rec.yaml +0 -0
  1193. /paddlex/configs/{ts_anomaly_detection → modules/ts_anomaly_detection}/AutoEncoder_ad.yaml +0 -0
  1194. /paddlex/configs/{ts_anomaly_detection → modules/ts_anomaly_detection}/DLinear_ad.yaml +0 -0
  1195. /paddlex/configs/{ts_anomaly_detection → modules/ts_anomaly_detection}/Nonstationary_ad.yaml +0 -0
  1196. /paddlex/configs/{ts_anomaly_detection → modules/ts_anomaly_detection}/PatchTST_ad.yaml +0 -0
  1197. /paddlex/configs/{ts_anomaly_detection → modules/ts_anomaly_detection}/TimesNet_ad.yaml +0 -0
  1198. /paddlex/configs/{ts_classification → modules/ts_classification}/TimesNet_cls.yaml +0 -0
  1199. /paddlex/configs/{ts_forecast → modules/ts_forecast}/DLinear.yaml +0 -0
  1200. /paddlex/configs/{ts_forecast → modules/ts_forecast}/NLinear.yaml +0 -0
  1201. /paddlex/configs/{ts_forecast → modules/ts_forecast}/Nonstationary.yaml +0 -0
  1202. /paddlex/configs/{ts_forecast → modules/ts_forecast}/PatchTST.yaml +0 -0
  1203. /paddlex/configs/{ts_forecast → modules/ts_forecast}/RLinear.yaml +0 -0
  1204. /paddlex/configs/{ts_forecast → modules/ts_forecast}/TiDE.yaml +0 -0
  1205. /paddlex/configs/{ts_forecast → modules/ts_forecast}/TimesNet.yaml +0 -0
  1206. /paddlex/configs/{vehicle_attribute → modules/vehicle_attribute_recognition}/PP-LCNet_x1_0_vehicle_attribute.yaml +0 -0
  1207. /paddlex/configs/{vehicle_detection → modules/vehicle_detection}/PP-YOLOE-L_vehicle.yaml +0 -0
  1208. /paddlex/configs/{vehicle_detection → modules/vehicle_detection}/PP-YOLOE-S_vehicle.yaml +0 -0
  1209. {paddlex-3.0.0b2.dist-info → paddlex-3.0.0rc1.dist-info}/entry_points.txt +0 -0
  1210. {paddlex-3.0.0b2.dist-info → paddlex-3.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  1211. {paddlex-3.0.0b2.dist-info → paddlex-3.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2600 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import os
17
+ from dataclasses import dataclass
18
+ from functools import partial
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import paddle
22
+ import paddle.distributed.fleet.meta_parallel as mpu
23
+ import paddle.nn as nn
24
+ import paddle.nn.functional as F
25
+ from paddle import Tensor
26
+ from paddle.distributed import fleet
27
+ from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
28
+ from paddle.distributed.fleet.utils import recompute
29
+
30
+ from .....utils import logging
31
+ from ....utils.benchmark import (
32
+ benchmark,
33
+ get_inference_operations,
34
+ set_inference_operations,
35
+ )
36
+ from ...common.vlm.activations import ACT2FN
37
+ from ...common.vlm.bert_padding import index_first_axis, pad_input, unpad_input
38
+ from ...common.vlm.flash_attn_utils import has_flash_attn_func
39
+ from ...common.vlm.transformers import PretrainedConfig, PretrainedModel
40
+ from ...common.vlm.transformers.model_outputs import (
41
+ BaseModelOutputWithPast,
42
+ ModelOutput,
43
+ )
44
+
45
# Flash-attention entry points, resolved once at import time by the project helper.
# NOTE(review): presumably both are None when flash attention is unavailable
# (`create_attention_module` tests `flash_attn_func is not None`) — confirm
# against `has_flash_attn_func`.
flash_attn_func, flash_attn_varlen_func = has_flash_attn_func()
# True when the current Paddle device string contains "npu".
_IS_NPU = "npu" in paddle.get_device()

# Short aliases for the dense and tensor-parallel linear layers.
Linear = nn.Linear
ColumnParallelLinear = mpu.ColumnParallelLinear
RowParallelLinear = mpu.RowParallelLinear
51
+
52
+
53
class Qwen2VLVisionConfig(PretrainedConfig):
    """Configuration of the Qwen2-VL vision encoder.

    Every constructor argument is stored unchanged as an attribute of the same
    name; remaining keyword arguments are forwarded to `PretrainedConfig`.

    Args:
        depth (`int`, *optional*, defaults to 32): Stored as `self.depth`.
        embed_dim (`int`, *optional*, defaults to 1280): Stored as `self.embed_dim`.
        hidden_size (`int`, *optional*, defaults to 3584): Stored as `self.hidden_size`.
        hidden_act (`str`, *optional*, defaults to `"quick_gelu"`): Stored as `self.hidden_act`.
        mlp_ratio (`int`, *optional*, defaults to 4): Stored as `self.mlp_ratio`.
        num_heads (`int`, *optional*, defaults to 16): Stored as `self.num_heads`.
        in_channels (`int`, *optional*, defaults to 3): Stored as `self.in_channels`.
        patch_size (`int`, *optional*, defaults to 14): Stored as `self.patch_size`.
        spatial_merge_size (`int`, *optional*, defaults to 2): Stored as `self.spatial_merge_size`.
        temporal_patch_size (`int`, *optional*, defaults to 2): Stored as `self.temporal_patch_size`.
        attn_implementation (`str`, *optional*, defaults to `"eager"`): Stored as `self.attn_implementation`.
    """

    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        attn_implementation="eager",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Encoder stack.
        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_act = hidden_act
        self.attn_implementation = attn_implementation

        # Input patching.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.spatial_merge_size = spatial_merge_size

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        """Build a vision config from a pretrained name or path.

        When the loaded dictionary is a full `qwen2_vl` model config, only its
        `vision_config` sub-dictionary is used. A warning is logged if the
        remaining dictionary declares a different `model_type` than this class.
        """
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # A whole-model config nests the vision settings one level down.
        if config_dict.get("model_type") == "qwen2_vl":
            config_dict = config_dict["vision_config"]

        type_mismatch = (
            hasattr(cls, "model_type")
            and "model_type" in config_dict
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logging.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
108
+
109
+
110
class Qwen2VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Qwen2VLModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, `num_attention_heads` is used.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict` or `Qwen2VLVisionConfig`, *optional*):
            The config for the visual encoder initialization. A dictionary is expanded into a
            [`Qwen2VLVisionConfig`], `None` yields a default [`Qwen2VLVisionConfig`], and any other object is
            stored as is.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
    """

    model_type = "qwen2_vl"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()
        else:
            # An already-built config object used to be dropped silently,
            # leaving `self.vision_config` undefined; keep it instead.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # `None` means "one key/value head per attention head" (plain MHA).
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
228
+
229
+
230
def get_triangle_upper_mask(x, mask=None):
    """Return `mask` when given, otherwise build an additive upper-triangular mask.

    The generated mask takes the shape of `x` with axis 1 set to 1, is filled
    with the minimum finite value of `x.dtype` strictly above the diagonal of
    the last two axes (zero elsewhere), and has gradients disabled.
    """
    if mask is None:
        mask_shape = x.shape
        mask_shape[1] = 1
        lowest = paddle.finfo(x.dtype).min
        mask = paddle.triu(
            paddle.full(mask_shape, lowest, dtype=x.dtype), diagonal=1
        )
        mask.stop_gradient = True
    return mask
239
+
240
+
241
def parallel_matmul(
    x: Tensor, y: Tensor, transpose_y=True, tensor_parallel_output=True
):
    """Multiply `x` by `y`, taking the tensor-parallel path when `y` is sharded.

    Args:
        x (`Tensor`): Left operand.
        y (`Tensor`): Right operand (typically a weight); transposed when
            `transpose_y` is true.
        transpose_y (`bool`, *optional*, defaults to `True`):
            Forwarded to `paddle.matmul`.
        tensor_parallel_output (`bool`, *optional*, defaults to `True`):
            On the tensor-parallel path, return the local (sharded) result
            when true; otherwise concatenate the shards across the
            model-parallel group before returning.

    Returns:
        `Tensor`: The matmul result.
    """
    is_fleet_init = True
    tensor_parallel_degree = 1
    try:
        hcg = fleet.get_hybrid_communicate_group()
        model_parallel_group = hcg.get_model_parallel_group()
        tensor_parallel_degree = hcg.get_model_parallel_world_size()
    except Exception:
        # Fleet is not initialised (single-process run): use the plain matmul.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit are not swallowed here.
        is_fleet_init = False

    if paddle.in_dynamic_mode():
        y_is_distributed = y.is_distributed
    else:
        # The code does not read `y.is_distributed` outside dynamic mode;
        # it falls back to the parallel degree instead.
        y_is_distributed = tensor_parallel_degree > 1

    if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed:
        input_parallel = paddle.distributed.collective._c_identity(
            x, group=model_parallel_group
        )
        logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y)

        if tensor_parallel_output:
            return logits
        return paddle.distributed.collective._c_concat(
            logits, group=model_parallel_group
        )

    return paddle.matmul(x, y, transpose_y=transpose_y)
274
+
275
+
276
+ def _compute_default_rope_parameters(
277
+ config: Optional[PretrainedConfig] = None,
278
+ device: Optional["paddle.device"] = None,
279
+ seq_len: Optional[int] = None,
280
+ **rope_kwargs,
281
+ ) -> Tuple["paddle.Tensor", float]:
282
+ """
283
+ Computes the inverse frequencies according to the original RoPE implementation
284
+ Args:
285
+ config ([`~transformers.PretrainedConfig`]):
286
+ The model configuration.
287
+ device (`paddle.device`):
288
+ The device to use for initialization of the inverse frequencies.
289
+ seq_len (`int`, *optional*):
290
+ The current sequence length. Unused for this type of RoPE.
291
+ rope_kwargs (`Dict`, *optional*):
292
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
293
+ Returns:
294
+ Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
295
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
296
+ """
297
+ if config is not None and len(rope_kwargs) > 0:
298
+ raise ValueError(
299
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
300
+ f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
301
+ )
302
+ if len(rope_kwargs) > 0:
303
+ base = rope_kwargs["base"]
304
+ dim = rope_kwargs["dim"]
305
+ elif config is not None:
306
+ base = config.rope_theta
307
+ partial_rotary_factor = (
308
+ config.partial_rotary_factor
309
+ if hasattr(config, "partial_rotary_factor")
310
+ else 1.0
311
+ )
312
+ head_dim = getattr(
313
+ config, "head_dim", config.hidden_size // config.num_attention_heads
314
+ )
315
+ dim = int(head_dim * partial_rotary_factor)
316
+
317
+ attention_factor = 1.0
318
+
319
+ inv_freq = 1.0 / (
320
+ base ** (paddle.arange(0, dim, 2, dtype="int64").astype("float32") / dim)
321
+ )
322
+ return inv_freq, attention_factor
323
+
324
+
325
# Maps a RoPE type name to the function that computes its inverse frequencies.
# Only the "default" type is registered here; `Qwen2VLRotaryEmbedding` looks
# its `rope_type` up in this table, so any other name raises `KeyError` there.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
}
328
+
329
+
330
def _get_unpad_data(attention_mask):
    """Derive un-padding metadata from an attention mask.

    Returns:
        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)` where `indices`
        are the flat positions of the non-zero mask entries, `cu_seqlens` is
        the cumulative sum of the per-row mask sums with a leading zero, and
        `max_seqlen_in_batch` is the largest per-row sum as a Python number.
    """
    tokens_per_row = attention_mask.sum(axis=-1, dtype="int32")
    flat_mask = attention_mask.flatten()
    kept_positions = paddle.nonzero(flat_mask, as_tuple=False).flatten()
    longest_row = tokens_per_row.max().item()
    running_total = paddle.cumsum(tokens_per_row, axis=0)
    # Prepend a single zero so consecutive entries delimit each sequence.
    cu_seqlens = F.pad(running_total, (1, 0), data_format="NCL")
    return kept_positions, cu_seqlens, longest_row
342
+
343
+
344
def is_casual_mask(attention_mask):
    """
    Return True when `attention_mask` is equal to its own upper triangle,
    which this module treats as the mask being causal.
    """
    upper_part = paddle.triu(attention_mask)
    unchanged = upper_part == attention_mask
    return unchanged.all().item()
349
+
350
+
351
def _make_causal_mask(input_ids_shape, past_key_values_length):
    """
    Build the boolean causal mask used for self-attention.

    The result has shape `[batch_size, 1, target_length, target_length + past_key_values_length]`:
    a lower-triangular block for the current tokens, preceded by an all-True
    block covering the cached positions when `past_key_values_length > 0`.
    """
    batch_size, target_length = input_ids_shape
    total_length = target_length + past_key_values_length

    square = paddle.ones((target_length, target_length), dtype="bool")
    mask = paddle.tril(square)

    if past_key_values_length > 0:
        # Every current token may attend to every cached position.
        history = paddle.ones([target_length, past_key_values_length], dtype="bool")
        mask = paddle.concat([history, mask], axis=-1)

    return mask[None, None, :, :].expand(
        [batch_size, 1, target_length, total_length]
    )
368
+
369
+
370
def _expand_2d_mask(mask, dtype, tgt_length):
    """
    Expand a `[batch_size, src_length]` attention mask to a boolean
    `[batch_size, 1, tgt_length, src_length]` mask.

    `tgt_length` falls back to `src_length` when it is None. `dtype` is
    accepted but not used by this function.
    """
    batch_size = mask.shape[0]
    src_length = mask.shape[-1]
    if tgt_length is None:
        tgt_length = src_length

    bool_mask = mask[:, None, None, :].astype("bool")
    bool_mask.stop_gradient = True
    return bool_mask.expand([batch_size, 1, tgt_length, src_length])
382
+
383
+
384
@dataclass
class Qwen2VLCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Qwen2VL causal language model (or autoregressive) outputs.

    All fields default to `None`.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        rope_deltas (`paddle.Tensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    # NOTE(review): annotated as a list while the docstring describes nested tuples — confirm with producers.
    past_key_values: Optional[List[paddle.Tensor]] = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
    rope_deltas: Optional[paddle.Tensor] = None
421
+
422
+
423
class Qwen2VLRotaryEmbedding(nn.Layer):
    """Rotary position embedding producing cos/sin tables for three position streams.

    The layer is configured either from explicit arguments (`dim`, `base`, ...)
    or from a `Qwen2VLConfig`; when `config` is given, the RoPE type is read
    from `config.rope_scaling` (key `rope_type`, falling back to `type`) and
    defaults to `"default"` when `rope_scaling` is None. The RoPE type must be
    a key of `ROPE_INIT_FUNCTIONS`.

    Args:
        dim (`int`, *optional*): Rotary dimension; used only when `config` is None.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            Length of the cos/sin cache built at construction time.
        base (`int`, *optional*, defaults to 10000): RoPE base; used only when `config` is None.
        device (*optional*): Forwarded to the RoPE init function.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            Stored in `rope_kwargs["factor"]` when `config` is None.
        rope_type (`str`, *optional*, defaults to `"default"`): RoPE type when `config` is None.
        config (`Qwen2VLConfig`, *optional*): Model configuration; takes precedence over the explicit arguments.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen2VLConfig] = None,
    ):
        super().__init__()
        self.rope_kwargs = {}
        if config is None:
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get(
                    "rope_type", config.rope_scaling.get("type")
                )
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        self.inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device, **self.rope_kwargs
        )
        self.original_inv_freq = self.inv_freq

        # NOTE: the cache length comes from the constructor argument, even
        # when `config` is given, and resets `max_seq_len_cached` accordingly.
        self._set_cos_sin_cache(seq_len=max_position_embeddings)

    def _set_cos_sin_cache(self, seq_len):
        """Precompute `cos_cached` / `sin_cached` for positions `0 .. seq_len - 1`."""
        self.max_seq_len_cached = seq_len
        t = paddle.arange(seq_len, dtype="float32")
        freqs = paddle.einsum("i,j->ij", t, self.inv_freq)
        emb = paddle.concat([freqs, freqs], axis=-1)
        self.cos_cached = emb.cos()
        self.sin_cached = emb.sin()

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = paddle.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.inv_freq = inv_freq
            self.max_seq_len_cached = seq_len

        if (
            seq_len < self.original_max_seq_len
            and self.max_seq_len_cached > self.original_max_seq_len
        ):  # reset
            self.inv_freq = self.original_inv_freq
            self.max_seq_len_cached = self.original_max_seq_len

    @paddle.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.

        Args:
            x (`paddle.Tensor`): Only its dtype is used (and `x.device` on the dynamic path).
            position_ids (`paddle.Tensor`): Position indices; presumably of shape
                `(3, batch_size, seq_len)` — the code expands `inv_freq` to a leading
                axis of 3 and uses `position_ids.shape[1]` as the second axis.
        """
        if "dynamic" in self.rope_type:
            # NOTE(review): no "dynamic" type is registered in ROPE_INIT_FUNCTIONS,
            # so this branch looks unreachable with the visible table.
            self._dynamic_frequency_update(position_ids, device=x.device)

        inv_freq_expanded = (
            self.inv_freq[None, None, :, None]
            .astype("float32")
            .expand([3, position_ids.shape[1], -1, 1])
        )
        position_ids_expanded = position_ids[:, :, None, :].astype("float32")
        # Force float32: both operands were just cast to float32, so mixed
        # precision must be switched OFF here. `auto_cast()` with no argument
        # enables it, which would let the matmul be downcast to half precision.
        with paddle.amp.auto_cast(enable=False):
            freqs = paddle.matmul(inv_freq_expanded, position_ids_expanded)
            freqs = freqs.transpose([0, 1, 3, 2])
            emb = paddle.concat((freqs, freqs), axis=-1)
            cos = emb.cos()
            sin = emb.sin()

        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.astype(x.dtype), sin.astype(x.dtype)
525
+
526
+
527
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
528
def rotate_half(x):
    """Swap the two halves of the last axis of `x`, negating the second half.

    With `x = [x1, x2]` split at `x.shape[-1] // 2`, the result is `[-x2, x1]`.
    """
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return paddle.concat([-second, first], axis=-1)
533
+
534
+
535
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).

    Explanation:
        Multimodal 3D rotary position embedding extends 1D rotary position embedding. For the vision part of the
        sequence, rotary position embedding is applied on the temporal, height and width dimensions separately, by
        splitting the channel dimension into sections. For the text part the three position indices are identical,
        so the result is the same as ordinary 1D rotary position embedding.

    Args:
        q (`paddle.Tensor`): The query tensor.
        k (`paddle.Tensor`): The key tensor.
        cos (`paddle.Tensor`): The cosine part of the rotary embedding. Its leading axis is indexed with `0..2`.
        sin (`paddle.Tensor`): The sine part of the rotary embedding. Its leading axis is indexed with `0..2`.
        mrope_section (`List(int)`):
            Channel section sizes for temporal, height and width. The list is repeated twice before splitting
            `cos` / `sin` along their last axis.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis along which the selected cos/sin are unsqueezed so that they broadcast against `q` and `k`.
            Use 1 when `q` and `k` are `[batch_size, heads, seq_len, head_dim]` and 2 when they are
            `[batch_size, seq_len, heads, head_dim]`.
    Returns:
        `tuple(paddle.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    doubled_sections = mrope_section * 2

    def _select_sections(table):
        # Section i takes its values from position stream i % 3.
        chunks = table.split(doubled_sections, axis=-1)
        picked = [chunk[idx % 3] for idx, chunk in enumerate(chunks)]
        return paddle.concat(x=picked, axis=-1).unsqueeze(axis=unsqueeze_dim)

    cos = _select_sections(cos)
    sin = _select_sections(sin)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
579
+
580
+
581
def apply_rotary_pos_emb_vision(
    tensor: paddle.Tensor, freqs: paddle.Tensor
) -> paddle.Tensor:
    """Rotate `tensor` by the angles in `freqs`, computing in float32.

    The cosine and sine of `freqs` are tiled twice along their last axis and
    given extra axes at positions 1 and 0 before being combined with `tensor`
    and `rotate_half(tensor)`. The result is cast back to the input dtype.
    """
    orig_dtype = tensor.dtype

    def _broadcastable(table):
        return (
            table.unsqueeze(1)
            .tile(repeat_times=[1, 1, 2])
            .unsqueeze(0)
            .astype(dtype="float32")
        )

    # Mixed precision is disabled so the rotation really runs in float32.
    with paddle.amp.auto_cast(False):
        tensor = tensor.astype(dtype="float32")
        cos = _broadcastable(freqs.cos())
        sin = _broadcastable(freqs.sin())
        rotated = tensor * cos + rotate_half(tensor) * sin
    return paddle.cast(rotated, orig_dtype)
605
+
606
+
607
class VisionRotaryEmbedding(nn.Layer):
    """Rotary frequency table for the vision encoder.

    Stores `inv_freq = 1 / theta ** (arange(0, dim, 2) / dim)` and, on call,
    returns the outer product of `arange(seqlen)` with `inv_freq`.
    """

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim
        self.inv_freq = 1.0 / theta**exponents

    def forward(self, seqlen: int) -> paddle.Tensor:
        positions = paddle.arange(seqlen).cast(self.inv_freq.dtype)
        return paddle.outer(x=positions, y=self.inv_freq)
618
+
619
+
620
class PatchEmbed(nn.Layer):
    """Project flattened image/video patches to embeddings with a strided 3D convolution.

    The convolution uses `[temporal_patch_size, patch_size, patch_size]` as
    both kernel size and stride and has no bias.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        patch_volume = [temporal_patch_size, patch_size, patch_size]
        self.proj = nn.Conv3D(
            in_channels,
            embed_dim,
            kernel_size=patch_volume,
            stride=patch_volume,
            bias_attr=False,
        )

    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        """Reshape the input into patches, project them, and return `[-1, embed_dim]`."""
        patch_shape = [
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.patch_size,
            self.patch_size,
        ]
        patches = hidden_states.reshape(patch_shape)
        # `paddle.cast` is used instead of `Tensor.to`, which is unavailable
        # on static-graph variables.
        patches = paddle.cast(patches, dtype=self.proj.weight.dtype)
        return self.proj(patches).reshape([-1, self.embed_dim])
662
+
663
+
664
class PatchMerger(nn.Layer):
    """Merge groups of `spatial_merge_size ** 2` patch features and project them to `dim`.

    Input features are layer-normalised over `context_dim`, reshaped to rows of
    `context_dim * spatial_merge_size ** 2` values, and passed through a
    Linear -> GELU -> Linear head.
    """

    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        merged_width = context_dim * (spatial_merge_size**2)
        self.hidden_size = merged_width
        self.ln_q = nn.LayerNorm(context_dim, epsilon=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(merged_width, merged_width),
            nn.GELU(),
            nn.Linear(merged_width, dim),
        )

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        normed = self.ln_q(x)
        merged = normed.reshape([-1, self.hidden_size])
        return self.mlp(merged)
678
+
679
+
680
class VisionMlp(nn.Layer):
    """Two-layer feed-forward block: Linear -> activation -> Linear.

    The activation is looked up in `ACT2FN` by the `hidden_act` name.
    """

    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.act = ACT2FN[hidden_act]
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, x) -> paddle.Tensor:
        expanded = self.fc1(x)
        activated = self.act(expanded)
        return self.fc2(activated)
689
+
690
+
691
class VisionAttention(nn.Layer):
    """Eager multi-head self-attention for the vision encoder.

    Tokens of several sequences are packed along axis 0; `cu_seqlens` gives
    the packed boundaries, and attention is restricted to tokens of the same
    sequence through a block-diagonal mask.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
        self.proj = nn.Linear(dim, dim)
        self.head_dim = dim // num_heads  # used for the 1/sqrt(d) score scaling

    def forward(
        self,
        hidden_states: paddle.Tensor,
        cu_seqlens: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor = None,
    ) -> paddle.Tensor:
        """Attend within each packed sequence and project the result.

        Args:
            hidden_states (`paddle.Tensor`): Packed tokens; axis 0 is the total token count.
            cu_seqlens (`paddle.Tensor`): Boundaries of the packed sequences; consecutive
                entries delimit one sequence.
            rotary_pos_emb (`paddle.Tensor`): Rotary angles applied to queries and keys.
                Despite the `None` default it is used unconditionally.

        Returns:
            `paddle.Tensor` with `hidden_states.shape[0]` rows after the output projection.
        """
        seq_length = hidden_states.shape[0]
        q, k, v = (
            self.qkv(hidden_states)
            .reshape([seq_length, 3, self.num_heads, -1])
            .transpose([1, 0, 2, 3])
            .unbind(0)
        )
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Block-diagonal boolean mask: True where query and key belong to the
        # same packed sequence.
        attention_mask = paddle.zeros([1, seq_length, seq_length], dtype="bool")
        for i in range(1, len(cu_seqlens)):
            attention_mask[
                ...,
                cu_seqlens[i - 1] : cu_seqlens[i],
                cu_seqlens[i - 1] : cu_seqlens[i],
            ] = True

        # Convert to an additive mask: 0 where allowed, dtype-min elsewhere.
        zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
        neg_inf = paddle.full_like(
            attention_mask,
            paddle.finfo(hidden_states.dtype).min,
            dtype=hidden_states.dtype,
        )
        attention_mask = paddle.where(attention_mask, zero, neg_inf)

        # [seq, heads, head_dim] -> [heads, seq, head_dim]
        q = q.transpose([1, 0, 2])
        k = k.transpose([1, 0, 2])
        v = v.transpose([1, 0, 2])
        attn_weights = paddle.matmul(q, k.transpose([0, 2, 1])) / math.sqrt(
            self.head_dim
        )
        attn_weights = attn_weights + attention_mask
        # Softmax runs in float32 for stability; cast the probabilities back so
        # the matmul with `v` and the projection see a single dtype when the
        # model runs in half precision (a no-op for float32 models).
        attn_weights = nn.functional.softmax(
            attn_weights, axis=-1, dtype="float32"
        ).astype(v.dtype)
        attn_output = paddle.matmul(attn_weights, v)
        attn_output = attn_output.transpose([1, 0, 2])
        attn_output = attn_output.reshape([seq_length, -1])
        attn_output = self.proj(attn_output)
        return attn_output
744
+
745
+
746
class VisionFlashAttention2(nn.Layer):
    """Flash-attention variant of the vision self-attention layer.

    Queries, keys and values are cast to bfloat16 for the attention kernel;
    the result is cast to the projection weight's dtype (bfloat16, float16 or
    float32) before the output projection.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
        self.proj = nn.Linear(dim, dim)
        self.head_dim = dim // num_heads  # used for the softmax scale

    def forward(
        self,
        hidden_states: paddle.Tensor,
        cu_seqlens: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor = None,
    ) -> paddle.Tensor:
        """Run variable-length flash attention over packed tokens and project the result."""
        seq_length = tuple(hidden_states.shape)[0]
        fused = self.qkv(hidden_states).reshape(
            [seq_length, 3, self.num_heads, -1]
        )
        q, k, v = fused.transpose(perm=[1, 0, 2, 3]).unbind(axis=0)

        def _rotate(states):
            return apply_rotary_pos_emb_vision(
                states.unsqueeze(axis=0), rotary_pos_emb
            ).squeeze(axis=0)

        q = _rotate(q)
        k = _rotate(k)

        q_bf16 = q.astype("bfloat16")
        k_bf16 = k.astype("bfloat16")
        v_bf16 = v.astype("bfloat16")

        if _IS_NPU:
            attn_output = paddle.nn.functional.flash_attention_npu(
                q_bf16,
                k_bf16,
                v_bf16,
                is_varlen=True,
                batch_size=1,
                seq_length=seq_length,
            ).reshape([seq_length, -1])
        else:
            segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
            max_seqlen = segment_lengths.max().item()
            flash_result = flash_attn_varlen_func(
                q_bf16,
                k_bf16,
                v_bf16,
                cu_seqlens,
                cu_seqlens,
                max_seqlen,
                max_seqlen,
                scale=self.head_dim**-0.5,
            )
            attn_output = flash_result[0].squeeze(0).reshape([seq_length, -1])

        # Match the projection weights; other dtypes leave the output untouched.
        proj_dtype = self.proj.weight.dtype
        if proj_dtype in (paddle.bfloat16, paddle.float16, paddle.float32):
            attn_output = attn_output.astype(proj_dtype)
        return self.proj(attn_output)
809
+
810
+
811
def create_attention_module(config, module_type, layer_idx=None):
    """Instantiate the attention layer for `module_type`.

    The flash-attention class is chosen when `flash_attn_func` is available;
    otherwise a warning is logged once and the eager class is used.

    Args:
        config: Configuration passed to the language-model attention classes;
            for `"vision"`, its `embed_dim` and `num_heads` are used.
        module_type (`str`): `"qwen2vl"` or `"vision"`.
        layer_idx (`int`, *optional*): Forwarded to the `"qwen2vl"` attention classes.

    Returns:
        The attention layer, or `None` when `module_type` is neither
        `"qwen2vl"` nor `"vision"`.
    """
    use_flash = flash_attn_func is not None
    if not use_flash:
        logging.warning_once(
            f"Warning: Flash Attention2 is not available for {module_type}, fallback to normal attention."
        )

    if module_type == "qwen2vl":
        language_cls = Qwen2VLFlashAttention2 if use_flash else Qwen2VLAttention
        return language_cls(config, layer_idx)
    if module_type == "vision":
        vision_cls = VisionFlashAttention2 if use_flash else VisionAttention
        return vision_cls(config.embed_dim, num_heads=config.num_heads)
    return None
826
+
827
+
828
class Qwen2VLVisionBlock(nn.Layer):
    """Pre-norm transformer block of the vision tower (attention + MLP)."""

    def __init__(self, config, attn_implementation: str = "flash_attention_2") -> None:
        """Create the sub-layers from `config`.

        `attn_implementation` is accepted but not read here; the attention
        variant is chosen by `create_attention_module`.
        """
        super().__init__()
        embed_dim = config.embed_dim
        self.norm1 = nn.LayerNorm(embed_dim, epsilon=1e-6)
        self.norm2 = nn.LayerNorm(embed_dim, epsilon=1e-6)

        self.attn = create_attention_module(config, "vision")
        self.mlp = VisionMlp(
            dim=embed_dim,
            hidden_dim=int(embed_dim * config.mlp_ratio),
            hidden_act=config.hidden_act,
        )

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor:
        """Apply attention and MLP, each wrapped in a residual connection."""
        attn_out = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
        )
        hidden_states = hidden_states + attn_out

        mlp_out = self.mlp(self.norm2(hidden_states))
        return hidden_states + mlp_out
850
+
851
+
852
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: paddle.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: paddle.dtype,
    min_dtype: float,
    cache_position: paddle.Tensor,
    batch_size: int,
):
    """
    Build a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`. A mask that is already 4D is returned unchanged.

    Args:
        attention_mask (`paddle.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
            `(batch_size, 1, query_length, key_value_length)`. May be `None`.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache,
            to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`paddle.dtype`):
            The dtype to use for the 4D attention mask.
        min_dtype (`float`):
            The minimum value representable with the dtype `dtype`.
        cache_position (`paddle.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`int`):
            Batch size.
    """
    # A 4D mask is taken as-is.
    if attention_mask is not None and attention_mask.dim() == 4:
        return attention_mask

    # Start fully masked, then keep only the strictly-upper triangle when
    # more than one query position is processed.
    mask_2d = paddle.full(
        [sequence_length, target_length], fill_value=min_dtype, dtype=dtype
    )
    if sequence_length != 1:
        mask_2d = paddle.triu(x=mask_2d, diagonal=1)

    # Zero out entries for key positions not beyond the query's cache position.
    beyond_query = paddle.arange(target_length) > cache_position.reshape([-1, 1])
    mask_2d *= beyond_query

    causal_mask = mask_2d[None, None, :, :].expand(shape=[batch_size, 1, -1, -1])
    if attention_mask is None:
        return causal_mask

    # Clone before the in-place slice assignment below.
    causal_mask = causal_mask.clone()
    mask_length = tuple(attention_mask.shape)[-1]
    visible = causal_mask[:, :, :, :mask_length]
    # Positions where both the causal entry and the 2D mask are 0 are padding.
    is_padding = (visible + attention_mask[:, None, None, :]) == 0
    causal_mask[:, :, :, :mask_length] = visible.masked_fill(
        mask=is_padding, value=min_dtype
    )
    return causal_mask
905
+
906
+
907
class Qwen2RMSNorm(nn.Layer):
    """Root-mean-square layer normalisation with a learned per-channel scale."""

    def __init__(self, config: Qwen2VLConfig, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm.

        `config` is accepted for signature compatibility and is not read here.
        """
        super().__init__()
        self.weight = paddle.create_parameter(
            shape=[hidden_size],
            dtype=paddle.get_default_dtype(),
            default_initializer=nn.initializer.Constant(1.0),
        )
        self.variance_epsilon = eps

    def _rms_normalize(self, hidden_states):
        """Scale `hidden_states` by the reciprocal RMS of its last axis.

        The variance is computed on a float32 copy of the input.
        """
        variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
        return paddle.rsqrt(variance + self.variance_epsilon) * hidden_states

    def forward(self, hidden_states):
        """Normalise `hidden_states` and multiply by the learned weight."""
        if paddle.in_dynamic_mode():
            # Auto mixed precision is switched off around the normalisation.
            with paddle.amp.auto_cast(False):
                hidden_states = self._rms_normalize(hidden_states)
        else:
            hidden_states = self._rms_normalize(hidden_states)

        weight_dtype = self.weight.dtype
        if weight_dtype in [paddle.float16, paddle.bfloat16]:
            hidden_states = paddle.cast(hidden_states, weight_dtype)
        return hidden_states * self.weight
936
+
937
+
938
class Qwen2MLP(nn.Layer):
    """Gated feed-forward block: `down_proj(act(gate_proj(x)) * up_proj(x))`."""

    def __init__(self, config):
        """Create the three projections, tensor-parallel when requested."""
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fuse_attention_ffn = config.fuse_attention_ffn
        self.tensor_parallel_degree = config.tensor_parallel_degree

        hidden, inner = self.hidden_size, self.intermediate_size
        if config.tensor_parallel_degree > 1:
            # Column-parallel in, row-parallel out; intermediate activations
            # are not gathered between them.
            self.gate_proj = ColumnParallelLinear(
                hidden, inner, gather_output=False, has_bias=False
            )
            self.up_proj = ColumnParallelLinear(
                hidden, inner, gather_output=False, has_bias=False
            )
            self.down_proj = RowParallelLinear(
                inner, hidden, input_is_parallel=True, has_bias=False
            )
        else:
            self.gate_proj = Linear(hidden, inner, bias_attr=False)  # w1
            self.up_proj = Linear(hidden, inner, bias_attr=False)  # w3
            self.down_proj = Linear(inner, hidden, bias_attr=False)  # w2

        self.act_fn = ACT2FN[config.hidden_act]
        # Set to False here; when True, forward calls act_fn(gate, up) directly.
        self.fuse_swiglu = False

    def forward(self, x):
        """Apply the gated MLP to `x`."""
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        gated = self.act_fn(gate, up) if self.fuse_swiglu else self.act_fn(gate) * up
        return self.down_proj(gated)
988
+
989
+
990
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
991
def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis.

    Equivalent to paddle.repeat_interleave(x, axis=1, repeats=n_rep): the input of shape
    (batch, num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states

    # Insert a repeat axis after the head axis, broadcast it, then fold it
    # back into the head axis.
    with_repeat_axis = hidden_states[:, :, None, :, :]
    broadcast = with_repeat_axis.expand([bsz, kv_heads, n_rep, seq_len, dim])
    return broadcast.reshape([bsz, kv_heads * n_rep, seq_len, dim])
1003
+
1004
+
1005
class Qwen2VLAttention(nn.Layer):
    """
    Eager multi-headed self-attention for the Qwen2-VL decoder.

    Queries and keys receive multimodal rotary position embeddings, key/value heads are repeated to match the
    number of query heads, and the softmax is computed in float32.
    """

    def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
        """Create the q/k/v/o projections and the rotary embedding.

        Args:
            config: Model configuration.
            layer_idx: Index of the decoder layer that owns this module. A
                warning is logged once when it is omitted.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logging.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.rope_scaling = config.rope_scaling

        if config.tensor_parallel_degree > 1:
            # Under tensor parallelism each rank keeps only its share of heads.
            assert (
                self.num_heads % config.tensor_parallel_degree == 0
            ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
            self.num_heads = self.num_heads // config.tensor_parallel_degree

            assert (
                self.num_key_value_heads % config.tensor_parallel_degree == 0
            ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
            self.num_key_value_heads = (
                self.num_key_value_heads // config.tensor_parallel_degree
            )

        # The projection widths use the un-split head counts from the config.
        kv_width = self.config.num_key_value_heads * self.head_dim
        if config.tensor_parallel_degree > 1:
            self.q_proj = ColumnParallelLinear(
                self.hidden_size, self.hidden_size, has_bias=True, gather_output=False
            )
            self.k_proj = ColumnParallelLinear(
                self.hidden_size, kv_width, has_bias=True, gather_output=False
            )
            self.v_proj = ColumnParallelLinear(
                self.hidden_size, kv_width, has_bias=True, gather_output=False
            )
            self.o_proj = RowParallelLinear(
                self.hidden_size,
                self.hidden_size,
                has_bias=False,
                input_is_parallel=True,
            )
        else:
            self.q_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=True)
            self.k_proj = Linear(self.hidden_size, kv_width, bias_attr=True)
            self.v_proj = Linear(self.hidden_size, kv_width, bias_attr=True)
            self.o_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=False)

        self.rotary_emb = Qwen2VLRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,  # default true
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
        """Compute self-attention over `hidden_states`.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden)`.
            attention_mask: Optional additive mask added to the attention
                scores before the softmax.
            position_ids: Position ids forwarded to the rotary embedding.
            past_key_value: Optional `(key, value)` cache that is concatenated
                in front of the new keys/values along the sequence axis.
            output_attentions: Return the attention probabilities when True.
            use_cache: Return the updated `(key, value)` pair when True.
            cache_position: Accepted for interface compatibility; not read.

        Returns:
            `(attn_output, attn_weights or None, past_key_value or None)`.

        Raises:
            ValueError: If the attention output has an unexpected shape.
        """
        bsz, q_len, _ = hidden_states.shape

        try:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)
        except Exception:
            # Retry with the input cast to the configured dtype. Only
            # ordinary exceptions are caught so KeyboardInterrupt/SystemExit
            # are never swallowed.
            hidden_states = hidden_states.astype(self.config.dtype)
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # A 0 in a paddle reshape target keeps the corresponding input dim.
        target_query_shape = [0, 0, self.num_heads, self.head_dim]
        target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
        query_states = query_states.reshape(shape=target_query_shape)
        key_states = key_states.reshape(shape=target_key_value_shape)
        value_states = value_states.reshape(shape=target_key_value_shape)

        # [batch, seq, heads, dim] -> [batch, heads, seq, dim]
        new_perm = [0, 2, 1, 3]
        query_states = query_states.transpose(new_perm)
        key_states = key_states.transpose(new_perm)
        value_states = value_states.transpose(new_perm)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            key_states = paddle.concat([past_key_value[0], key_states], axis=2)
            value_states = paddle.concat([past_key_value[1], value_states], axis=2)
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Scores and softmax are computed in float32.
        query_states = query_states.astype("float32")
        key_states = key_states.astype("float32")
        value_states = value_states.astype("float32")

        attn_weights = paddle.matmul(
            query_states, key_states.transpose([0, 1, 3, 2])
        ) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask
        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype="float32")
        attn_weights = nn.functional.dropout(
            x=attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = paddle.matmul(
            attn_weights.cast(self.config.dtype), value_states.cast(self.config.dtype)
        )

        expected_shape = [bsz, self.num_heads, q_len, self.head_dim]
        if attn_output.shape != expected_shape:
            # Report the shape that is actually checked above.
            raise ValueError(
                f"`attn_output` should be of size {tuple(expected_shape)}, but is"
                f" {attn_output.shape}"
            )

        attn_output = attn_output.transpose([0, 2, 1, 3])
        attn_output = attn_output.reshape([bsz, q_len, -1])

        # Match the activation dtype to the output projection weights.
        if self.o_proj.weight.dtype == paddle.bfloat16:
            attn_output = attn_output.astype(paddle.bfloat16)
        elif self.o_proj.weight.dtype == paddle.float16:
            attn_output = attn_output.astype(paddle.float16)
        elif self.o_proj.weight.dtype == paddle.float32:
            attn_output = attn_output.astype(paddle.float32)

        attn_output = self.o_proj(attn_output)
        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights, past_key_value
1168
+
1169
+
1170
class Qwen2VLFlashAttention2(Qwen2VLAttention):
    """
    Qwen2VL flash attention module. It inherits from `Qwen2VLAttention`, so the weights are unchanged; only the
    forward pass differs, calling the flash attention API and handling padding tokens when an attention mask is
    supplied.
    """

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,  # default true
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
        """Compute self-attention with the flash attention kernels.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden)`.
            attention_mask: Optional mask forwarded to
                `_flash_attention_forward`.
            position_ids: Position ids forwarded to the rotary embedding.
            past_key_value: Optional `(key, value)` cache concatenated in
                front of the new keys/values along the sequence axis.
            output_attentions: Accepted for interface compatibility. This
                path never materialises attention probabilities, so the
                second return value is always `None`.
            use_cache: Return the updated `(key, value)` pair when True.
            cache_position: Accepted for interface compatibility; not read.

        Returns:
            `(attn_output, None, past_key_value or None)`.
        """
        bsz, q_len, _ = tuple(hidden_states.shape)

        try:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)
        except Exception:
            # Retry with a bfloat16 input. Only ordinary exceptions are
            # caught so KeyboardInterrupt/SystemExit are never swallowed.
            hidden_states = hidden_states.astype("bfloat16")
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # A 0 in a paddle reshape target keeps the corresponding input dim.
        target_query_shape = [0, 0, self.num_heads, self.head_dim]
        target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
        query_states = query_states.reshape(shape=target_query_shape)
        key_states = key_states.reshape(shape=target_key_value_shape)
        value_states = value_states.reshape(shape=target_key_value_shape)

        # [batch, seq, heads, dim] -> [batch, heads, seq, dim]
        new_perm = [0, 2, 1, 3]
        query_states = query_states.transpose(new_perm)
        key_states = key_states.transpose(new_perm)
        value_states = value_states.transpose(new_perm)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            key_states = paddle.concat([past_key_value[0], key_states], axis=2)
            value_states = paddle.concat([past_key_value[1], value_states], axis=2)
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Back to [batch, seq, heads, dim] before calling flash attention.
        query_states = query_states.transpose(perm=[0, 2, 1, 3])
        key_states = key_states.transpose(perm=[0, 2, 1, 3])
        value_states = value_states.transpose(perm=[0, 2, 1, 3])

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len
        )

        attn_output = attn_output.reshape([bsz, q_len, -1])
        attn_output = self.o_proj(attn_output)

        # Always None: previously this name was unbound (and raised) when
        # output_attentions was True.
        attn_weights = None
        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Call the flash attention kernel, unpadding and re-padding around it when an attention mask is given
        (non-NPU path).

        Args:
            query_states (`paddle.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`paddle.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`paddle.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`paddle.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Number of query positions; causal masking is disabled when it equals 1.
            dropout (`float`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                Currently ignored: the non-NPU path always uses `head_dim ** -0.5` and the NPU path does not read it.
        """
        # A single query token attends to everything, so no causal mask.
        causal = self.is_causal and query_length != 1

        if _IS_NPU:
            if attention_mask is not None:
                attn_output = paddle.nn.functional.flash_attention_npu(  # TODO: flash_attn_unpadded
                    query_states,
                    key_states,
                    value_states,
                    attn_mask=attention_mask,
                    dropout=dropout,
                    causal=causal,
                    is_varlen=True,
                )
            else:
                # Run the kernel in bfloat16 and restore the input dtype.
                dtype = query_states.dtype
                attn_output = paddle.nn.functional.flash_attention_npu(  # TODO: flash_attn_unpadded
                    query_states.astype("bfloat16"),
                    key_states.astype("bfloat16"),
                    value_states.astype("bfloat16"),
                    attn_mask=attention_mask,
                    dropout=dropout,
                    causal=causal,
                )
                attn_output = attn_output.astype(dtype)
        else:
            head_dim = query_states.shape[-1]
            softmax_scale = head_dim**-0.5  # TODO: needs to be set manually

            if attention_mask is not None:
                batch_size = query_states.shape[0]
                (
                    query_states,
                    key_states,
                    value_states,
                    indices_q,
                    cu_seq_lens,
                    max_seq_lens,
                ) = self._unpad_input(
                    query_states, key_states, value_states, attention_mask, query_length
                )
                cu_seqlens_q, cu_seqlens_k = cu_seq_lens
                max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    scale=softmax_scale,  # the keyword is `scale`, not `softmax_scale`
                    dropout=dropout,
                    causal=causal,
                )[0]

                attn_output = pad_input(
                    attn_output_unpad, indices_q, batch_size, query_length
                )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    causal=causal,
                )[0]

        return attn_output

    def _unpad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        """Strip padded positions from q/k/v according to `attention_mask`.

        Returns:
            `(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_q, max_seqlen_k))` with `indices_q` converted to int64.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # TODO:cuda error
        flat_kv_shape = [batch_size * kv_seq_len, num_key_value_heads, head_dim]
        key_layer = index_first_axis(key_layer.reshape(flat_kv_shape), indices_k)
        value_layer = index_first_axis(value_layer.reshape(flat_kv_shape), indices_k)

        if query_length == kv_seq_len:
            # Queries and keys cover the same positions: reuse the key indices.
            query_layer = index_first_axis(
                query_layer.reshape(
                    [batch_size * kv_seq_len, self.num_heads, head_dim]
                ),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per sample.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = paddle.arange(
                batch_size + 1, dtype=paddle.int32
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q.to(paddle.int64),
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
1400
+
1401
+
1402
class Qwen2VLDecoderLayer(nn.Layer):
    """One decoder layer: pre-norm self-attention followed by a pre-norm MLP."""

    def __init__(self, config: Qwen2VLConfig, layer_idx: int):
        """Create the attention, MLP and the two RMS norms for layer `layer_idx`."""
        super().__init__()
        self.hidden_size = config.hidden_size

        # Sliding-window attention is only expected with flash attention 2.
        sliding_without_flash = (
            config.use_sliding_window
            and config.attn_implementation != "flash_attention_2"
        )
        if sliding_without_flash:
            logging.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config.attn_implementation}`; "
                "unexpected results may be encountered."
            )

        self.self_attn = create_attention_module(config, "qwen2vl", layer_idx=layer_idx)
        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(
            config, config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Qwen2RMSNorm(
            config, config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[paddle.Tensor] = None,
        **kwargs,
    ):
        """
        Run the layer on `hidden_states`.

        Args:
            hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`paddle.Tensor`, *optional*): attention mask forwarded to the attention module.
            position_ids (`paddle.Tensor`, *optional*): position ids forwarded to the attention module.
            past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                When true, the attention weights returned by the attention module are appended to the output.
            use_cache (`bool`, *optional*):
                When true, the present key/value states are appended to the output.
            cache_position (`paddle.Tensor`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary extra keyword arguments; they are ignored.

        Returns:
            A tuple starting with the new hidden states, optionally followed by the attention weights and the
            present key/value states, in that order.
        """
        # Self-attention sub-block with residual connection.
        attn_input = self.input_layernorm(hidden_states)
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block with residual connection.
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_out

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
1488
+
1489
+
1490
class Qwen2VLPreTrainedModel(PretrainedModel):
    """Base class carrying the shared config class and weight initialisation."""

    config_class = Qwen2VLConfig
    base_model_prefix = "model"
    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, layer):
        """Initialise `layer` in place.

        Linear/Conv3D weights and embedding weights are drawn from a normal
        distribution; biases are zeroed; the padding row of an embedding, if
        any, is zeroed. Other layer types are left untouched.
        """
        std = 0.2

        if isinstance(layer, (nn.Linear, nn.Conv3D)):
            nn.initializer.Normal(mean=0.0, std=std)(layer.weight)
            if layer.bias is not None:
                nn.initializer.Constant(0.0)(layer.bias)
            return

        if not isinstance(layer, nn.Embedding):
            return

        nn.initializer.Normal(mean=0.0, std=std)(layer.weight)
        padding_idx = layer._padding_idx
        if padding_idx is not None:
            with paddle.no_grad():
                layer.weight[padding_idx] = 0.0
1507
+
1508
+
1509
class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
    """Vision tower: patch embedding, rotary-position transformer blocks, patch merger."""

    config_class = Qwen2VLVisionConfig
    _no_split_modules = ["Qwen2VLVisionBlock"]

    def __init__(self, config) -> None:
        """Build the patch embedding, rotary embedding, blocks and merger from `config`."""
        super().__init__(config)
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        per_head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(per_head_dim // 2)

        vision_blocks = [Qwen2VLVisionBlock(config) for _ in range(config.depth)]
        self.blocks = nn.LayerList(vision_blocks)
        self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)
        self.enable_recompute = False

    def get_dtype(self) -> paddle.dtype:
        """Return the dtype of the first block's `mlp.fc2` weight."""
        first_block = self.blocks[0]
        return first_block.mlp.fc2.weight.dtype

    def rot_pos_emb(self, grid_thw):
        """Compute rotary position embeddings for every patch described by `grid_thw`.

        Each row of `grid_thw` is unpacked as `(t, h, w)`. Row and column
        indices are laid out in spatial-merge-window order and tiled `t`
        times before being used to index the rotary table.
        """
        merge = self.spatial_merge_size
        per_item_ids = []
        for t, h, w in grid_thw:
            # Split the h x w grid into merge x merge windows.
            window_shape = [h // merge, merge, w // merge, merge]

            row_ids = paddle.arange(h).unsqueeze(1).expand([-1, w])
            row_ids = row_ids.reshape(window_shape)
            row_ids = row_ids.transpose(perm=[0, 2, 1, 3]).flatten()

            col_ids = paddle.arange(w).unsqueeze(0).expand([h, -1])
            col_ids = col_ids.reshape(window_shape)
            col_ids = col_ids.transpose([0, 2, 1, 3]).flatten()

            hw_ids = paddle.stack(x=[row_ids, col_ids], axis=-1)
            per_item_ids.append(hw_ids.tile(repeat_times=[t, 1]))

        pos_ids = paddle.concat(x=per_item_ids, axis=0)
        # The rotary table only needs to cover the largest h or w present.
        largest_side = grid_thw[:, 1:].max()
        rotary_table = self.rotary_pos_emb(largest_side)
        return rotary_table[pos_ids].flatten(start_axis=1)

    @paddle.jit.not_to_static
    def recompute_training_full(
        self,
        layer_module: nn.Layer,
        hidden_states: paddle.Tensor,
        cu_seqlens_now: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor,
    ):
        """Run `layer_module` through `recompute` and return its output."""

        def create_custom_forward(module):
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        wrapped_forward = create_custom_forward(layer_module)
        return recompute(
            wrapped_forward,
            hidden_states,
            cu_seqlens_now,
            rotary_pos_emb,
        )

    def forward(
        self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor
    ) -> paddle.Tensor:
        """Encode patches and return the merged visual features.

        Args:
            hidden_states: Patch input passed to the patch embedding.
            grid_thw: Per-item `(t, h, w)` grid sizes.
        """
        hidden_states = self.patch_embed(hidden_states)
        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # One segment of h*w tokens per temporal step of each item, turned
        # into cumulative offsets with a leading 0.
        tokens_per_frame = grid_thw[:, 1] * grid_thw[:, 2]
        cu_seqlens = paddle.repeat_interleave(tokens_per_frame, grid_thw[:, 0])
        cu_seqlens = cu_seqlens.cumsum(axis=0, dtype="int32")
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for blk in self.blocks:
            if self.enable_recompute and self.training:
                hidden_states = self.recompute_training_full(
                    blk, hidden_states, cu_seqlens, rotary_pos_emb
                )
            else:
                hidden_states = blk(
                    hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
                )

        return self.merger(hidden_states)
1617
+
1618
+
1619
class Qwen2VLModel(Qwen2VLPreTrainedModel):
    """Text decoder backbone: token embedding, a stack of
    ``Qwen2VLDecoderLayer`` blocks and a final ``Qwen2RMSNorm``.

    Recompute (activation checkpointing) is off by default and is meant to be
    switched on from outside (e.g. by a trainer). Both ``enable_recompute``
    and the historical, misspelled ``enamble_recompute`` attribute are
    honoured so that existing callers of either spelling keep working.
    """

    def __init__(self, config: Qwen2VLConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size

        # Shard the embedding table over the vocabulary axis only when the
        # vocabulary divides evenly across the tensor-parallel ranks.
        if (
            config.tensor_parallel_degree > 1
            and config.vocab_size % config.tensor_parallel_degree == 0
        ):
            self.embed_tokens = mpu.VocabParallelEmbedding(
                self.vocab_size,
                self.hidden_size,
                weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()),
            )
        else:
            self.embed_tokens = nn.Embedding(
                self.vocab_size,
                self.hidden_size,
            )

        self.layers = nn.LayerList(
            [
                Qwen2VLDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = Qwen2RMSNorm(config, config.hidden_size, eps=config.rms_norm_eps)

        # Recompute defaults to False and is controlled by the trainer.
        # ``enamble_recompute`` is the legacy (misspelled) name and is kept for
        # backward compatibility; ``enable_recompute`` matches the spelling
        # used by the vision tower in this file.
        self.enable_recompute = False
        self.enamble_recompute = False

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding layer with ``value``."""
        self.embed_tokens = value

    @staticmethod
    def _prepare_decoder_attention_mask(
        attention_mask, input_shape, past_key_values_length, dtype
    ):
        """Build the additive float attention mask consumed by the decoder layers.

        Args:
            attention_mask: ``None``, or a 2-D / 3-D / 4-D mask tensor.
            input_shape: ``(batch_size, seq_length)`` of the current step.
            past_key_values_length: number of already cached positions.
            dtype: dtype of the returned mask.

        Returns:
            A tensor of ``dtype`` holding ``0.0`` where the boolean mask is
            true and ``paddle.finfo(dtype).min`` elsewhere.
        """
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            if len(attention_mask.shape) == 2:
                expanded_attn_mask = _expand_2d_mask(
                    attention_mask, dtype, tgt_length=input_shape[-1]
                )
                # During single-token decoding (seq_length == 1) no causal
                # mask is needed on top of the padding mask.
                if input_shape[-1] > 1:
                    combined_attention_mask = _make_causal_mask(
                        input_shape,
                        past_key_values_length=past_key_values_length,
                    )
                    expanded_attn_mask = expanded_attn_mask & combined_attention_mask
            # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
            elif len(attention_mask.shape) == 3:
                expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
            # Any other rank (expected to be 4-D already) is used unchanged.
            else:
                expanded_attn_mask = attention_mask
        else:
            expanded_attn_mask = _make_causal_mask(
                input_shape,
                past_key_values_length=past_key_values_length,
            )
        # Convert the boolean mask to a float mask that is later added to the
        # attention scores.
        expanded_attn_mask = paddle.where(
            expanded_attn_mask, 0.0, paddle.finfo(dtype).min
        ).astype(dtype)
        return expanded_attn_mask

    @paddle.jit.not_to_static
    def recompute_training_full(
        self,
        layer_module: nn.Layer,
        hidden_states: paddle.Tensor,
        attention_mask: paddle.Tensor,
        position_ids: Optional[paddle.Tensor],
        past_key_value: paddle.Tensor,
        output_attentions: bool,
        use_cache: bool,
        cache_position: Optional[paddle.Tensor] = None,
    ):
        """Run ``layer_module`` through ``recompute`` with positional arguments.

        The layer is wrapped in a plain closure because the arguments are
        forwarded positionally; ``use_reentrant`` is read from
        ``self.config.recompute_use_reentrant``.

        Returns:
            Whatever ``layer_module`` returns for the given inputs.
        """

        def create_custom_forward(module):
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        hidden_states = recompute(
            create_custom_forward(layer_module),
            hidden_states,
            attention_mask,
            position_ids,
            past_key_value,
            output_attentions,
            use_cache,
            cache_position,
            use_reentrant=self.config.recompute_use_reentrant,
        )
        return hidden_states

    def forward(
        self,
        input_ids: paddle.Tensor = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_values: Optional[List[paddle.Tensor]] = None,
        inputs_embeds: Optional[paddle.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the decoder stack.

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.
        Unset ``output_attentions``, ``output_hidden_states``, ``use_cache``
        and ``return_dict`` fall back to the values stored on ``self.config``.

        Args:
            input_ids: token ids, unpacked as ``(batch_size, seq_length)``.
            attention_mask: optional mask; an all-true
                ``(batch_size, seq_length + cache_length)`` mask is created
                when omitted.
            position_ids: optional position ids; when omitted they are
                derived from ``cache_position`` and expanded to
                ``[3, batch_size, seq_length]``.
            past_key_values: optional per-layer cache entries.
            inputs_embeds: precomputed embeddings, unpacked as
                ``(batch_size, seq_length, hidden)``.
            cache_position: optional positions of the current tokens.

        Returns:
            ``BaseModelOutputWithPast`` when ``return_dict`` is true,
            otherwise a tuple of the non-``None`` values among
            ``(hidden_states, cache, all_hidden_states, all_attentions)``.

        Raises:
            ValueError: if both or neither of ``input_ids`` /
                ``inputs_embeds`` are provided.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # The XOR is true when both inputs are given or both are missing, so
        # after this check exactly one of them is available.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        else:
            batch_size, seq_length, _ = inputs_embeds.shape

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.layers))
        # NOTE: a list lets each entry be cleared as soon as it was consumed.
        past_key_values = list(past_key_values)

        seq_length_with_past = seq_length
        cache_length = 0
        if past_key_values[0] is not None:
            cache_length = past_key_values[0][0].shape[2]  # shape[1] in qwen2
            seq_length_with_past += cache_length

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if attention_mask is None:
            # [bs, seq_len]
            attention_mask = paddle.ones(
                (batch_size, seq_length_with_past), dtype=paddle.bool
            )

        if flash_attn_varlen_func:
            # The flash-attention path receives the mask unchanged.
            causal_mask = attention_mask
        else:
            causal_mask = self._prepare_decoder_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                cache_length,
                inputs_embeds.dtype,
            )  # [bs, 1, seq_len, seq_len]

        if cache_position is None:
            cache_position = paddle.arange(
                cache_length, cache_length + inputs_embeds.shape[1]
            )

        if position_ids is None:
            # The hard coded `3` is for temporal, height and width.
            position_ids = cache_position.reshape([1, 1, -1]).expand(
                [3, inputs_embeds.shape[0], -1]
            )

        hidden_states = inputs_embeds

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = ()

        # Honour both spellings of the recompute switch (see class docstring).
        use_recompute = self.training and (
            self.enable_recompute or self.enamble_recompute
        )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx]

            if use_recompute:
                layer_outputs = self.recompute_training_full(
                    decoder_layer,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            # NOTE: drop the outdated cache entry right after use to save memory.
            past_key_value = past_key_values[idx] = None

            hidden_states = layer_outputs[0]

            # The new cache entry is the last element of the layer output.
            next_decoder_cache = (
                next_decoder_cache + (layer_outputs[-1],) if use_cache else None
            )

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # Add the hidden states produced by the last decoder layer.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
1880
+
1881
+
1882
class Qwen2LMHead(nn.Layer):
    """Output projection from hidden states to vocabulary logits.

    The weight is either created here or shared with an embedding table
    (``embedding_weights`` together with ``transpose_y=True``). When the
    vocabulary is split across tensor-parallel ranks the weight is flagged as
    distributed and its split axis is recorded.
    """

    def __init__(self, config, embedding_weights=None, transpose_y=False):
        super(Qwen2LMHead, self).__init__()
        self.config = config
        self.transpose_y = transpose_y

        tp_degree = config.tensor_parallel_degree
        splits_evenly = tp_degree > 1 and config.vocab_size % tp_degree == 0
        # Per-rank vocabulary size; equals the full size when not sharded.
        local_vocab = config.vocab_size // tp_degree if splits_evenly else config.vocab_size
        is_sharded = local_vocab != config.vocab_size

        if transpose_y and embedding_weights is not None:
            # Reuse the provided embedding weight.
            self.weight = embedding_weights
        elif transpose_y:
            # Same [vocab, hidden] layout as an embedding table.
            self.weight = self.create_parameter(
                shape=[local_vocab, config.hidden_size],
                dtype=paddle.get_default_dtype(),
            )
        elif is_sharded:
            # Sharded weights are created under the tracked RNG state.
            with get_rng_state_tracker().rng_state():
                self.weight = self.create_parameter(
                    shape=[config.hidden_size, local_vocab],
                    dtype=paddle.get_default_dtype(),
                )
        else:
            self.weight = self.create_parameter(
                shape=[config.hidden_size, local_vocab],
                dtype=paddle.get_default_dtype(),
            )

        # The distributed attribute must be set for tensor parallelism.
        self.weight.is_distributed = bool(is_sharded)
        if self.weight.is_distributed:
            # The vocabulary axis is 0 for the transposed (tied) layout, else 1.
            self.weight.split_axis = 0 if self.transpose_y else 1

    def forward(self, hidden_states, tensor_parallel_output=None):
        """Project ``hidden_states`` to logits via ``parallel_matmul``.

        Args:
            hidden_states: input activations; cast to the weight dtype when
                the two dtypes differ.
            tensor_parallel_output: forwarded to ``parallel_matmul``; falls
                back to ``self.config.tensor_parallel_output`` when ``None``.

        Returns:
            The result of ``parallel_matmul``.
        """
        if tensor_parallel_output is None:
            tensor_parallel_output = self.config.tensor_parallel_output

        # Keep the activation dtype consistent with the weight dtype.
        weight_dtype = self.weight.dtype
        if weight_dtype != hidden_states.dtype:
            hidden_states = paddle.cast(hidden_states, weight_dtype)

        return parallel_matmul(
            hidden_states,
            self.weight,
            transpose_y=self.transpose_y,
            tensor_parallel_output=tensor_parallel_output,
        )
1941
+
1942
+
1943
class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
    """Qwen2-VL with a vision tower, a text decoder and a language-model head.

    ``self.visual`` encodes image / video patches, ``self.model`` is the
    ``Qwen2VLModel`` decoder and ``self.lm_head`` projects hidden states to
    vocabulary logits (optionally tied to the input embedding).
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config, attn_implementation="flash_attention_2"):
        super().__init__(config)
        # The attention implementation is recorded on both the text and the
        # vision configuration.
        config._attn_implementation = attn_implementation
        config.vision_config._attn_implementation = attn_implementation

        self.visual = Qwen2VisionTransformerPretrainedModel._from_config(
            config.vision_config
        )
        self.model = Qwen2VLModel(config)
        self.vocab_size = config.vocab_size

        if config.tie_word_embeddings:
            self.lm_head = Qwen2LMHead(
                config,
                embedding_weights=self.model.embed_tokens.weight,
                transpose_y=True,
            )
            self.tie_weights()
        else:
            self.lm_head = Qwen2LMHead(config)
        # Left padding by default; users can change it via the attribute.
        self.padding_side = "left"

    def get_input_embeddings(self):
        """Return the decoder's token embedding layer."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding layer."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-model head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-model head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the text decoder."""
        self.model = decoder

    def get_decoder(self):
        """Return the text decoder."""
        return self.model

    @classmethod
    def _get_tensor_parallel_mappings(cls, config: Qwen2VLConfig, is_split=True):
        """Build the name -> split/merge action table for tensor parallelism.

        Actions are declared once for layer 0 and then replicated for every
        layer index in ``range(config.num_hidden_layers)``.

        Args:
            config: model configuration providing the tensor-parallel degree
                and rank, head counts, ``fuse_attention_ffn`` and the number
                of layers.
            is_split: forwarded to ``split_or_merge_func``.

        Returns:
            A dict mapping parameter names to partial split/merge callables.
        """

        logging.info("Qwen2 inference model _get_tensor_parallel_mappings")

        from paddlenlp.transformers.conversion_utils import split_or_merge_func

        fn = split_or_merge_func(
            is_split=is_split,
            tensor_parallel_degree=config.tensor_parallel_degree,
            tensor_parallel_rank=config.tensor_parallel_rank,
            num_attention_heads=config.num_attention_heads,
        )

        def get_tensor_parallel_split_mappings(num_layers):
            final_actions = {}

            base_actions = {
                "lm_head.weight": partial(fn, is_column=True),
                # Row Linear
                "embed_tokens.weight": partial(fn, is_column=False),
                "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
                "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
            }

            base_actions["layers.0.self_attn.q_proj.weight"] = partial(
                fn, is_column=True
            )
            base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True)
            # Key/value projections are only split when the number of KV
            # heads divides evenly across the ranks.
            if config.num_key_value_heads % config.tensor_parallel_degree == 0:
                base_actions["layers.0.self_attn.k_proj.weight"] = partial(
                    fn, is_column=True
                )
                base_actions["layers.0.self_attn.v_proj.weight"] = partial(
                    fn, is_column=True
                )
                base_actions["layers.0.self_attn.k_proj.bias"] = partial(
                    fn, is_column=True
                )
                base_actions["layers.0.self_attn.v_proj.bias"] = partial(
                    fn, is_column=True
                )

            if config.fuse_attention_ffn:
                base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial(
                    fn, is_column=True, is_naive_2fuse=True
                )
            else:
                base_actions["layers.0.mlp.gate_proj.weight"] = partial(
                    fn, is_column=True
                )
                base_actions["layers.0.mlp.up_proj.weight"] = partial(
                    fn, is_column=True
                )

            for key, action in base_actions.items():
                if "layers.0." in key:
                    for i in range(num_layers):
                        final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
                # The template key itself is always registered as well.
                final_actions[key] = action

            return final_actions

        mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)

        return mappings

    @staticmethod
    def get_rope_index(
        spatial_merge_size,
        image_token_id,
        video_token_id,
        vision_start_token_id,
        input_ids: paddle.Tensor,
        image_grid_thw: Optional[paddle.Tensor] = None,
        video_grid_thw: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [3, 4, 5, 6, 7]
                text height position_ids: [3, 4, 5, 6, 7]
                text width position_ids: [3, 4, 5, 6, 7]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            spatial_merge_size: factor by which the grid height and width are
                divided to obtain the LLM-side grid.
            image_token_id: id of the image placeholder token.
            video_token_id: id of the video placeholder token.
            vision_start_token_id: id of the token that precedes each vision
                segment.
            input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            image_grid_thw (`paddle.Tensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`paddle.Tensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                When omitted, every token is treated as not masked.

        Returns:
            position_ids (`paddle.Tensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`paddle.Tensor` of shape `(batch_size, 1)`)
        """
        mrope_position_deltas = []
        if image_grid_thw is not None or video_grid_thw is not None:
            total_input_ids = input_ids
            # The write-back below indexes the mask per sample, so fall back
            # to an all-ones mask when the caller did not provide one.
            scatter_mask = (
                attention_mask
                if attention_mask is not None
                else paddle.ones_like(total_input_ids)
            )
            position_ids = paddle.ones(
                [3, total_input_ids.shape[0], total_input_ids.shape[1]],
                dtype=total_input_ids.dtype,
            )
            image_index, video_index = 0, 0
            for i, sample_ids in enumerate(total_input_ids):
                # TODO: CUDA error in some paddle version, hence the CPU round trip.
                if attention_mask is not None:
                    sample_ids = paddle.to_tensor(
                        sample_ids.cpu()[attention_mask[i].cpu() == 1]
                    )

                # The token right after each vision-start marker tells
                # whether the segment is an image or a video.
                vision_start_indices = paddle.nonzero(
                    sample_ids == vision_start_token_id
                ).squeeze(1)
                vision_tokens = sample_ids[vision_start_indices + 1]
                if vision_tokens.numel() > 0:
                    # Plain ints keep ``range`` and the counters below
                    # independent of tensor semantics.
                    image_nums = int((vision_tokens == image_token_id).sum())
                    video_nums = int((vision_tokens == video_token_id).sum())
                else:
                    image_nums, video_nums = 0, 0

                input_tokens = sample_ids.tolist()
                llm_pos_ids_list: list = []
                st = 0
                remain_images, remain_videos = image_nums, video_nums
                for _ in range(image_nums + video_nums):
                    # Locate the next image / video placeholder at or after
                    # ``st``; a missing kind gets a sentinel past the end.
                    if image_token_id in input_tokens and remain_images > 0:
                        ed_image = input_tokens.index(image_token_id, st)
                    else:
                        ed_image = len(input_tokens) + 1
                    if video_token_id in input_tokens and remain_videos > 0:
                        ed_video = input_tokens.index(video_token_id, st)
                    else:
                        ed_video = len(input_tokens) + 1
                    if ed_image < ed_video:
                        t, h, w = (
                            image_grid_thw[image_index][0],
                            image_grid_thw[image_index][1],
                            image_grid_thw[image_index][2],
                        )
                        image_index += 1
                        remain_images -= 1
                        ed = ed_image
                    else:
                        t, h, w = (
                            video_grid_thw[video_index][0],
                            video_grid_thw[video_index][1],
                            video_grid_thw[video_index][2],
                        )
                        video_index += 1
                        remain_videos -= 1
                        ed = ed_video
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t.item(),
                        h.item() // spatial_merge_size,
                        w.item() // spatial_merge_size,
                    )
                    text_len = ed - st

                    # Text before the vision segment continues right after
                    # the largest position emitted so far.
                    st_idx = (
                        llm_pos_ids_list[-1].max() + 1
                        if len(llm_pos_ids_list) > 0
                        else 0
                    )
                    llm_pos_ids_list.append(
                        paddle.arange(text_len).reshape([1, -1]).expand([3, -1])
                        + st_idx
                    )

                    t_index = (
                        paddle.arange(llm_grid_t)
                        .reshape([-1, 1])
                        .expand([-1, llm_grid_h * llm_grid_w])
                        .flatten()
                    )
                    h_index = (
                        paddle.arange(llm_grid_h)
                        .reshape([1, -1, 1])
                        .expand([llm_grid_t, -1, llm_grid_w])
                        .flatten()
                    )
                    w_index = (
                        paddle.arange(llm_grid_w)
                        .reshape([1, 1, -1])
                        .expand([llm_grid_t, llm_grid_h, -1])
                        .flatten()
                    )
                    llm_pos_ids_list.append(
                        paddle.stack([t_index, h_index, w_index]) + text_len + st_idx
                    )
                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w

                # Trailing text after the last vision segment.
                if st < len(input_tokens):
                    st_idx = (
                        llm_pos_ids_list[-1].max() + 1
                        if len(llm_pos_ids_list) > 0
                        else 0
                    )
                    text_len = len(input_tokens) - st
                    llm_pos_ids_list.append(
                        paddle.arange(text_len).reshape([1, -1]).expand([3, -1])
                        + st_idx
                    )

                llm_positions = paddle.concat(llm_pos_ids_list, axis=1).reshape([3, -1])
                # Write the positions back into the unmasked slots of sample i.
                if _IS_NPU:
                    bool_indices = (
                        (scatter_mask[i] == 1)
                        .unsqueeze(0)
                        .tile([position_ids.shape[0], 1])
                    )
                    position_ids[:, i] = paddle.index_put(
                        position_ids[:, i], [bool_indices], llm_positions.reshape([-1])
                    )
                else:
                    position_ids[..., i, scatter_mask[i] == 1] = llm_positions
                mrope_position_deltas.append(
                    llm_positions.max() + 1 - len(total_input_ids[i])
                )
            mrope_position_deltas = paddle.to_tensor(mrope_position_deltas).unsqueeze(1)
        else:
            if attention_mask is not None:
                position_ids = paddle.cast(attention_mask, dtype="int64").cumsum(-1) - 1
                position_ids.masked_fill_(mask=attention_mask == 0, value=1)
                position_ids = position_ids.unsqueeze(0).expand([3, -1, -1])
                # ``Tensor.max`` in Paddle returns the values only, so reduce
                # over the rope axis and then over the sequence axis to get
                # one maximum per sample, shape [batch_size, 1].
                max_position_ids = position_ids.max(axis=0).max(axis=-1, keepdim=True)
                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
            else:
                position_ids = (
                    paddle.arange(input_ids.shape[1])
                    .reshape([1, 1, -1])
                    .expand(shape=[3, input_ids.shape[0], -1])
                )
                mrope_position_deltas = paddle.zeros(
                    [input_ids.shape[0], 1], dtype=input_ids.dtype
                )

        return position_ids, mrope_position_deltas

    def update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
    ) -> Dict[str, Any]:
        """Extend the parent's kwargs update by carrying ``rope_deltas`` over.

        When ``outputs`` has a non-``None`` ``rope_deltas`` attribute it is
        stored in ``model_kwargs`` for the next generation step.
        """
        model_kwargs = super().update_model_kwargs_for_generation(
            outputs=outputs,
            model_kwargs=model_kwargs,
            is_encoder_decoder=is_encoder_decoder,
        )

        if getattr(outputs, "rope_deltas", None) is not None:
            model_kwargs["rope_deltas"] = outputs.rope_deltas

        return model_kwargs

    def vision_forward(
        self,
        input_ids: paddle.Tensor,
        inputs_embeds: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        pixel_values: Optional[paddle.Tensor] = None,
        pixel_values_videos: Optional[paddle.Tensor] = None,
        image_grid_thw: Optional[paddle.Tensor] = None,
        video_grid_thw: Optional[paddle.Tensor] = None,
        rope_deltas: Optional[paddle.Tensor] = None,
    ):
        """Build input embeddings with vision features merged in.

        Only used with the block-inference decoder: when ``inputs_embeds`` is
        not supplied, ``self.model`` must be a
        ``Qwen2VLForConditionalGenerationBlockInferenceModel``. Pixel inputs
        are cast to bfloat16 before being encoded. ``attention_mask``,
        ``position_ids`` and ``rope_deltas`` are accepted but not used here.

        Returns:
            ``inputs_embeds`` with image / video placeholder positions
            overwritten by the encoded vision features, or the supplied
            ``inputs_embeds`` unchanged.
        """

        if inputs_embeds is None:
            from paddlenlp.experimental.transformers.qwen2.modeling import (
                Qwen2VLForConditionalGenerationBlockInferenceModel,
            )

            assert isinstance(
                self.model, Qwen2VLForConditionalGenerationBlockInferenceModel
            ), "model is not an instance of Qwen2VLForConditionalGenerationBlockInferenceModel"

            inputs_embeds = self.model.qwen2.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = paddle.cast(pixel_values, paddle.bfloat16)
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                image_mask = input_ids == self.config.image_token_id

                inputs_embeds[image_mask] = image_embeds
            if pixel_values_videos is not None:
                pixel_values_videos = paddle.cast(pixel_values_videos, paddle.bfloat16)
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                video_mask = input_ids == self.config.video_token_id
                inputs_embeds[video_mask] = video_embeds
        return inputs_embeds

    def forward(
        self,
        input_ids: paddle.Tensor = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_values: Optional[List[paddle.Tensor]] = None,
        inputs_embeds: Optional[paddle.Tensor] = None,
        labels: Optional[paddle.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[paddle.Tensor] = None,
        pixel_values_videos: Optional[paddle.Tensor] = None,
        image_grid_thw: Optional[paddle.Tensor] = None,
        video_grid_thw: Optional[paddle.Tensor] = None,
        rope_deltas: Optional[paddle.Tensor] = None,
    ):
        """Run vision encoding, the decoder and the LM head.

        Args:
            labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            return_dict: accepted for interface compatibility; the method
                always behaves as if it were ``True``.

        Returns:
            ``Qwen2VLCausalLMOutputWithPast`` with float32 ``logits``, the
            optional ``loss``, the decoder cache / hidden states / attentions
            and the ``rope_deltas`` that were passed in.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )

        # The caller-supplied value is deliberately ignored.
        return_dict = True

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)

            if pixel_values is not None:
                pixel_values = paddle.cast(pixel_values, inputs_embeds.dtype)

                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                image_embeds = paddle.cast(image_embeds, inputs_embeds.dtype)

                image_mask = input_ids == self.config.image_token_id
                if self.training:
                    # Write into a copy while training instead of the
                    # embedding output itself.
                    inputs_embeds = inputs_embeds.clone()
                inputs_embeds[image_mask] = image_embeds
            if pixel_values_videos is not None:
                pixel_values_videos = paddle.cast(
                    pixel_values_videos, inputs_embeds.dtype
                )
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                video_embeds = paddle.cast(video_embeds, inputs_embeds.dtype)
                video_mask = input_ids == self.config.video_token_id
                inputs_embeds[video_mask] = video_embeds

        # The decoder always receives embeddings, never token ids.
        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        tensor_parallel_output = (
            self.config.tensor_parallel_output
            and self.config.tensor_parallel_degree > 1
        )

        logits = self.lm_head(
            hidden_states, tensor_parallel_output=tensor_parallel_output
        )

        logits = paddle.cast(logits, "float32")

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            shift_logits = shift_logits.reshape([-1, self.config.vocab_size])
            shift_labels = shift_labels.reshape([-1])
            if _IS_NPU:
                tmp = F.log_softmax(shift_logits, axis=1)
                loss = F.nll_loss(tmp, shift_labels, reduction="sum")
            else:
                loss_fct = nn.CrossEntropyLoss(reduction="sum")
                loss = loss_fct(shift_logits, shift_labels)
            # Normalise the summed loss by the number of labels != -100.
            label_sum = paddle.sum(shift_labels != -100).cast("float32")
            loss = loss / label_sum

        # Kept for parity with the tuple-returning interface; not reached
        # while ``return_dict`` is forced to True above.
        if not return_dict:
            output = (logits,) + tuple(outputs[1:])
            return (loss,) + output if loss is not None else output

        return Qwen2VLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=rope_deltas,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step.

        ``cache_position`` is always recomputed here (the passed-in value is
        ignored): the full range on the first step, and the index of the last
        token once a cache exists. With a cache, only the last token id is
        fed to the model and pixel inputs are dropped once the cache position
        is non-zero. Position ids are computed only when an
        ``attention_mask`` is given and ``position_ids`` is not.

        Returns:
            A dict of model inputs for ``forward``.
        """

        batch_size, seq_length = input_ids.shape
        if past_key_values is None:
            cache_position = paddle.arange(input_ids.shape[1])
        else:
            cache_position = paddle.to_tensor([seq_length - 1])
            # Only the newest token needs to be processed.
            input_ids = input_ids[:, -1].unsqueeze(-1)

        rope_deltas = kwargs.get("rope_deltas", None)

        if attention_mask is not None and position_ids is None:
            if cache_position[0] == 0:
                # First step: derive the 3D rope positions from the prompt.
                position_ids, rope_deltas = self.get_rope_index(
                    self.config.vision_config.spatial_merge_size,
                    self.config.image_token_id,
                    self.config.video_token_id,
                    self.config.vision_start_token_id,
                    input_ids,
                    image_grid_thw,
                    video_grid_thw,
                    attention_mask,
                )
            else:
                # Later steps: offset plain positions by the cache position
                # plus the rope delta computed on the first step.
                batch_size, seq_length = input_ids.shape
                delta = cache_position[0] + rope_deltas if rope_deltas is not None else 0
                position_ids = paddle.arange(seq_length)
                position_ids = position_ids.reshape([1, -1]).expand([batch_size, -1])
                position_ids = position_ids + delta
                position_ids = position_ids.unsqueeze(axis=0).expand([3, -1, -1])

        if cache_position[0] != 0:
            pixel_values = None
            pixel_values_videos = None

        # If `inputs_embeds` are passed, only use them in the 1st generation step.
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "pixel_values": pixel_values,
                "pixel_values_videos": pixel_values_videos,
                "image_grid_thw": image_grid_thw,
                "video_grid_thw": video_grid_thw,
                "rope_deltas": rope_deltas,
            }
        )
        return model_inputs

    def gme_qwen2_vl_forward(
        self,
        input_ids: paddle.Tensor = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_values: Optional[List[paddle.Tensor]] = None,
        inputs_embeds: Optional[paddle.Tensor] = None,
        labels: Optional[paddle.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[paddle.Tensor] = None,
        pixel_values_videos: Optional[paddle.Tensor] = None,
        image_grid_thw: Optional[paddle.Tensor] = None,
        video_grid_thw: Optional[paddle.Tensor] = None,
        rope_deltas: Optional[paddle.Tensor] = None,
    ):
        """Run vision encoding and the decoder, returning the last-position state.

        Same input handling as ``forward`` but without the LM head or loss:
        the result is ``hidden_states[:, -1, :]`` of the decoder output.
        ``labels``, ``rope_deltas`` and the supplied ``return_dict`` are
        accepted but not used.
        """

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        # The caller-supplied value is deliberately ignored.
        return_dict = True

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                # Keep pixel_values in the same dtype as inputs_embeds.
                pixel_values = paddle.cast(pixel_values, inputs_embeds.dtype)
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                # Keep image_embeds in the same dtype as inputs_embeds.
                image_embeds = paddle.cast(image_embeds, inputs_embeds.dtype)
                image_mask = input_ids == self.config.image_token_id
                if self.training:
                    inputs_embeds = inputs_embeds.clone()

                inputs_embeds[image_mask] = image_embeds

            if pixel_values_videos is not None:
                # Keep pixel_values_videos in the same dtype as inputs_embeds.
                pixel_values_videos = paddle.cast(
                    pixel_values_videos, inputs_embeds.dtype
                )
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                # Keep video_embeds in the same dtype as inputs_embeds.
                video_embeds = paddle.cast(video_embeds, inputs_embeds.dtype)
                video_mask = input_ids == self.config.video_token_id
                inputs_embeds[video_mask] = video_embeds

        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # Hidden state at the last sequence position.
        last_hidden_state = hidden_states[:, -1, :]
        return last_hidden_state
2580
+
2581
+
2582
class PPDocBeeInference(Qwen2VLForConditionalGeneration):
    """Qwen2-VL conditional-generation model with a benchmarked ``generate``.

    Defining the class appends ``"docbee_generate"`` to the registered
    inference operations, matching the name used by the timing decorator on
    ``generate``.
    """

    set_inference_operations(get_inference_operations() + ["docbee_generate"])

    @benchmark.timeit_with_options(name="docbee_generate")
    def generate(self, inputs, **kwargs):
        """Generate token ids for ``inputs`` with gradients disabled.

        Args:
            inputs: Mapping expanded as keyword arguments to the parent
                ``generate``.
            **kwargs: Only ``max_new_tokens`` (default 2048), ``temperature``
                (default 0.1), ``top_p`` (default 0.001) and ``top_k``
                (default 1) are read; any other key is ignored.

        Returns:
            Whatever the parent class's ``generate`` returns.
        """
        decoding_options = {
            "max_new_tokens": kwargs.get("max_new_tokens", 2048),
            "temperature": kwargs.get("temperature", 0.1),
            "top_p": kwargs.get("top_p", 0.001),
            "top_k": kwargs.get("top_k", 1),
        }
        with paddle.no_grad():
            return super().generate(**inputs, **decoding_options)