paddlex 2.0.0rc4__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1533)
  1. paddlex/.version +1 -0
  2. paddlex/__init__.py +35 -18
  3. paddlex/__main__.py +39 -0
  4. paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml +38 -0
  5. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  6. paddlex/configs/modules/doc_text_orientation/PP-LCNet_x1_0_doc_ori.yaml +41 -0
  7. paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
  8. paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
  9. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  10. paddlex/configs/modules/face_detection/BlazeFace-FPN-SSH.yaml +40 -0
  11. paddlex/configs/modules/face_detection/BlazeFace.yaml +40 -0
  12. paddlex/configs/modules/face_detection/PP-YOLOE_plus-S_face.yaml +40 -0
  13. paddlex/configs/modules/face_detection/PicoDet_LCNet_x2_5_face.yaml +40 -0
  14. paddlex/configs/modules/face_feature/MobileFaceNet.yaml +41 -0
  15. paddlex/configs/modules/face_feature/ResNet50_face.yaml +41 -0
  16. paddlex/configs/modules/formula_recognition/LaTeX_OCR_rec.yaml +40 -0
  17. paddlex/configs/modules/formula_recognition/PP-FormulaNet-L.yaml +40 -0
  18. paddlex/configs/modules/formula_recognition/PP-FormulaNet-S.yaml +40 -0
  19. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  20. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  21. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  22. paddlex/configs/modules/formula_recognition/UniMERNet.yaml +40 -0
  23. paddlex/configs/modules/human_detection/PP-YOLOE-L_human.yaml +42 -0
  24. paddlex/configs/modules/human_detection/PP-YOLOE-S_human.yaml +42 -0
  25. paddlex/configs/modules/image_anomaly_detection/STFPM.yaml +41 -0
  26. paddlex/configs/modules/image_classification/CLIP_vit_base_patch16_224.yaml +41 -0
  27. paddlex/configs/modules/image_classification/CLIP_vit_large_patch14_224.yaml +41 -0
  28. paddlex/configs/modules/image_classification/ConvNeXt_base_224.yaml +41 -0
  29. paddlex/configs/modules/image_classification/ConvNeXt_base_384.yaml +41 -0
  30. paddlex/configs/modules/image_classification/ConvNeXt_large_224.yaml +41 -0
  31. paddlex/configs/modules/image_classification/ConvNeXt_large_384.yaml +41 -0
  32. paddlex/configs/modules/image_classification/ConvNeXt_small.yaml +41 -0
  33. paddlex/configs/modules/image_classification/ConvNeXt_tiny.yaml +41 -0
  34. paddlex/configs/modules/image_classification/FasterNet-L.yaml +40 -0
  35. paddlex/configs/modules/image_classification/FasterNet-M.yaml +40 -0
  36. paddlex/configs/modules/image_classification/FasterNet-S.yaml +40 -0
  37. paddlex/configs/modules/image_classification/FasterNet-T0.yaml +40 -0
  38. paddlex/configs/modules/image_classification/FasterNet-T1.yaml +40 -0
  39. paddlex/configs/modules/image_classification/FasterNet-T2.yaml +40 -0
  40. paddlex/configs/modules/image_classification/MobileNetV1_x0_25.yaml +41 -0
  41. paddlex/configs/modules/image_classification/MobileNetV1_x0_5.yaml +41 -0
  42. paddlex/configs/modules/image_classification/MobileNetV1_x0_75.yaml +41 -0
  43. paddlex/configs/modules/image_classification/MobileNetV1_x1_0.yaml +41 -0
  44. paddlex/configs/modules/image_classification/MobileNetV2_x0_25.yaml +41 -0
  45. paddlex/configs/modules/image_classification/MobileNetV2_x0_5.yaml +41 -0
  46. paddlex/configs/modules/image_classification/MobileNetV2_x1_0.yaml +41 -0
  47. paddlex/configs/modules/image_classification/MobileNetV2_x1_5.yaml +41 -0
  48. paddlex/configs/modules/image_classification/MobileNetV2_x2_0.yaml +41 -0
  49. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_35.yaml +41 -0
  50. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_5.yaml +41 -0
  51. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_75.yaml +41 -0
  52. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_0.yaml +41 -0
  53. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_25.yaml +41 -0
  54. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_35.yaml +41 -0
  55. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_5.yaml +41 -0
  56. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_75.yaml +41 -0
  57. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_0.yaml +41 -0
  58. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_25.yaml +41 -0
  59. paddlex/configs/modules/image_classification/MobileNetV4_conv_large.yaml +41 -0
  60. paddlex/configs/modules/image_classification/MobileNetV4_conv_medium.yaml +41 -0
  61. paddlex/configs/modules/image_classification/MobileNetV4_conv_small.yaml +41 -0
  62. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_large.yaml +41 -0
  63. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_medium.yaml +41 -0
  64. paddlex/configs/modules/image_classification/PP-HGNetV2-B0.yaml +41 -0
  65. paddlex/configs/modules/image_classification/PP-HGNetV2-B1.yaml +41 -0
  66. paddlex/configs/modules/image_classification/PP-HGNetV2-B2.yaml +41 -0
  67. paddlex/configs/modules/image_classification/PP-HGNetV2-B3.yaml +41 -0
  68. paddlex/configs/modules/image_classification/PP-HGNetV2-B4.yaml +41 -0
  69. paddlex/configs/modules/image_classification/PP-HGNetV2-B5.yaml +41 -0
  70. paddlex/configs/modules/image_classification/PP-HGNetV2-B6.yaml +41 -0
  71. paddlex/configs/modules/image_classification/PP-HGNet_base.yaml +41 -0
  72. paddlex/configs/modules/image_classification/PP-HGNet_small.yaml +41 -0
  73. paddlex/configs/modules/image_classification/PP-HGNet_tiny.yaml +41 -0
  74. paddlex/configs/modules/image_classification/PP-LCNetV2_base.yaml +41 -0
  75. paddlex/configs/modules/image_classification/PP-LCNetV2_large.yaml +41 -0
  76. paddlex/configs/modules/image_classification/PP-LCNetV2_small.yaml +41 -0
  77. paddlex/configs/modules/image_classification/PP-LCNet_x0_25.yaml +41 -0
  78. paddlex/configs/modules/image_classification/PP-LCNet_x0_35.yaml +41 -0
  79. paddlex/configs/modules/image_classification/PP-LCNet_x0_5.yaml +41 -0
  80. paddlex/configs/modules/image_classification/PP-LCNet_x0_75.yaml +41 -0
  81. paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yaml +41 -0
  82. paddlex/configs/modules/image_classification/PP-LCNet_x1_5.yaml +41 -0
  83. paddlex/configs/modules/image_classification/PP-LCNet_x2_0.yaml +41 -0
  84. paddlex/configs/modules/image_classification/PP-LCNet_x2_5.yaml +41 -0
  85. paddlex/configs/modules/image_classification/ResNet101.yaml +41 -0
  86. paddlex/configs/modules/image_classification/ResNet101_vd.yaml +41 -0
  87. paddlex/configs/modules/image_classification/ResNet152.yaml +41 -0
  88. paddlex/configs/modules/image_classification/ResNet152_vd.yaml +41 -0
  89. paddlex/configs/modules/image_classification/ResNet18.yaml +41 -0
  90. paddlex/configs/modules/image_classification/ResNet18_vd.yaml +41 -0
  91. paddlex/configs/modules/image_classification/ResNet200_vd.yaml +41 -0
  92. paddlex/configs/modules/image_classification/ResNet34.yaml +41 -0
  93. paddlex/configs/modules/image_classification/ResNet34_vd.yaml +41 -0
  94. paddlex/configs/modules/image_classification/ResNet50.yaml +41 -0
  95. paddlex/configs/modules/image_classification/ResNet50_vd.yaml +41 -0
  96. paddlex/configs/modules/image_classification/StarNet-S1.yaml +41 -0
  97. paddlex/configs/modules/image_classification/StarNet-S2.yaml +41 -0
  98. paddlex/configs/modules/image_classification/StarNet-S3.yaml +41 -0
  99. paddlex/configs/modules/image_classification/StarNet-S4.yaml +41 -0
  100. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window12_384.yaml +41 -0
  101. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window7_224.yaml +41 -0
  102. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window12_384.yaml +41 -0
  103. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window7_224.yaml +41 -0
  104. paddlex/configs/modules/image_classification/SwinTransformer_small_patch4_window7_224.yaml +41 -0
  105. paddlex/configs/modules/image_classification/SwinTransformer_tiny_patch4_window7_224.yaml +41 -0
  106. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec.yaml +42 -0
  107. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_base.yaml +42 -0
  108. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_large.yaml +41 -0
  109. paddlex/configs/modules/image_multilabel_classification/CLIP_vit_base_patch16_448_ML.yaml +41 -0
  110. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B0_ML.yaml +41 -0
  111. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B4_ML.yaml +41 -0
  112. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B6_ML.yaml +41 -0
  113. paddlex/configs/modules/image_multilabel_classification/PP-LCNet_x1_0_ML.yaml +41 -0
  114. paddlex/configs/modules/image_multilabel_classification/ResNet50_ML.yaml +41 -0
  115. paddlex/configs/modules/image_unwarping/UVDoc.yaml +12 -0
  116. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-FPN.yaml +40 -0
  117. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-vd-SSLDv2-FPN.yaml +40 -0
  118. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-H.yaml +40 -0
  119. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-L.yaml +40 -0
  120. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-M.yaml +40 -0
  121. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-S.yaml +40 -0
  122. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-X.yaml +40 -0
  123. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNeXt101-vd-FPN.yaml +39 -0
  124. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-FPN.yaml +40 -0
  125. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-vd-FPN.yaml +40 -0
  126. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-FPN.yaml +40 -0
  127. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-vd-FPN.yaml +40 -0
  128. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50.yaml +40 -0
  129. paddlex/configs/modules/instance_segmentation/PP-YOLOE_seg-S.yaml +40 -0
  130. paddlex/configs/modules/instance_segmentation/SOLOv2.yaml +40 -0
  131. paddlex/configs/modules/keypoint_detection/PP-TinyPose_128x96.yaml +40 -0
  132. paddlex/configs/modules/keypoint_detection/PP-TinyPose_256x192.yaml +40 -0
  133. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  134. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +40 -0
  135. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +40 -0
  136. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +40 -0
  137. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  138. paddlex/configs/modules/layout_detection/PicoDet-L_layout_17cls.yaml +40 -0
  139. paddlex/configs/modules/layout_detection/PicoDet-L_layout_3cls.yaml +40 -0
  140. paddlex/configs/modules/layout_detection/PicoDet-S_layout_17cls.yaml +40 -0
  141. paddlex/configs/modules/layout_detection/PicoDet-S_layout_3cls.yaml +40 -0
  142. paddlex/configs/modules/layout_detection/PicoDet_layout_1x.yaml +40 -0
  143. paddlex/configs/modules/layout_detection/PicoDet_layout_1x_table.yaml +40 -0
  144. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_17cls.yaml +40 -0
  145. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_3cls.yaml +40 -0
  146. paddlex/configs/modules/mainbody_detection/PP-ShiTuV2_det.yaml +41 -0
  147. paddlex/configs/modules/multilingual_speech_recognition/whisper_base.yaml +12 -0
  148. paddlex/configs/modules/multilingual_speech_recognition/whisper_large.yaml +12 -0
  149. paddlex/configs/modules/multilingual_speech_recognition/whisper_medium.yaml +12 -0
  150. paddlex/configs/modules/multilingual_speech_recognition/whisper_small.yaml +12 -0
  151. paddlex/configs/modules/multilingual_speech_recognition/whisper_tiny.yaml +12 -0
  152. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-FPN.yaml +41 -0
  153. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  154. paddlex/configs/modules/object_detection/CenterNet-DLA-34.yaml +41 -0
  155. paddlex/configs/modules/object_detection/CenterNet-ResNet50.yaml +41 -0
  156. paddlex/configs/modules/object_detection/Co-DINO-R50.yaml +40 -0
  157. paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml +40 -0
  158. paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml +40 -0
  159. paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml +40 -0
  160. paddlex/configs/modules/object_detection/DETR-R50.yaml +42 -0
  161. paddlex/configs/modules/object_detection/FCOS-ResNet50.yaml +41 -0
  162. paddlex/configs/modules/object_detection/FasterRCNN-ResNeXt101-vd-FPN.yaml +42 -0
  163. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101-FPN.yaml +42 -0
  164. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101.yaml +42 -0
  165. paddlex/configs/modules/object_detection/FasterRCNN-ResNet34-FPN.yaml +42 -0
  166. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-FPN.yaml +42 -0
  167. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-FPN.yaml +42 -0
  168. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  169. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50.yaml +42 -0
  170. paddlex/configs/modules/object_detection/FasterRCNN-Swin-Tiny-FPN.yaml +42 -0
  171. paddlex/configs/modules/object_detection/PP-YOLOE_plus-L.yaml +40 -0
  172. paddlex/configs/modules/object_detection/PP-YOLOE_plus-M.yaml +40 -0
  173. paddlex/configs/modules/object_detection/PP-YOLOE_plus-S.yaml +40 -0
  174. paddlex/configs/modules/object_detection/PP-YOLOE_plus-X.yaml +40 -0
  175. paddlex/configs/modules/object_detection/PicoDet-L.yaml +40 -0
  176. paddlex/configs/modules/object_detection/PicoDet-M.yaml +42 -0
  177. paddlex/configs/modules/object_detection/PicoDet-S.yaml +40 -0
  178. paddlex/configs/modules/object_detection/PicoDet-XS.yaml +42 -0
  179. paddlex/configs/modules/object_detection/RT-DETR-H.yaml +40 -0
  180. paddlex/configs/modules/object_detection/RT-DETR-L.yaml +40 -0
  181. paddlex/configs/modules/object_detection/RT-DETR-R18.yaml +40 -0
  182. paddlex/configs/modules/object_detection/RT-DETR-R50.yaml +40 -0
  183. paddlex/configs/modules/object_detection/RT-DETR-X.yaml +40 -0
  184. paddlex/configs/modules/object_detection/YOLOX-L.yaml +40 -0
  185. paddlex/configs/modules/object_detection/YOLOX-M.yaml +40 -0
  186. paddlex/configs/modules/object_detection/YOLOX-N.yaml +40 -0
  187. paddlex/configs/modules/object_detection/YOLOX-S.yaml +40 -0
  188. paddlex/configs/modules/object_detection/YOLOX-T.yaml +40 -0
  189. paddlex/configs/modules/object_detection/YOLOX-X.yaml +40 -0
  190. paddlex/configs/modules/object_detection/YOLOv3-DarkNet53.yaml +40 -0
  191. paddlex/configs/modules/object_detection/YOLOv3-MobileNetV3.yaml +40 -0
  192. paddlex/configs/modules/object_detection/YOLOv3-ResNet50_vd_DCN.yaml +40 -0
  193. paddlex/configs/modules/open_vocabulary_detection/GroundingDINO-T.yaml +13 -0
  194. paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
  195. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_box.yaml +17 -0
  196. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_point.yaml +15 -0
  197. paddlex/configs/modules/pedestrian_attribute_recognition/PP-LCNet_x1_0_pedestrian_attribute.yaml +41 -0
  198. paddlex/configs/modules/rotated_object_detection/PP-YOLOE-R-L.yaml +40 -0
  199. paddlex/configs/modules/seal_text_detection/PP-OCRv4_mobile_seal_det.yaml +40 -0
  200. paddlex/configs/modules/seal_text_detection/PP-OCRv4_server_seal_det.yaml +40 -0
  201. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R101.yaml +40 -0
  202. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R50.yaml +40 -0
  203. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R101.yaml +40 -0
  204. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R50.yaml +40 -0
  205. paddlex/configs/modules/semantic_segmentation/MaskFormer_small.yaml +42 -0
  206. paddlex/configs/modules/semantic_segmentation/MaskFormer_tiny.yaml +42 -0
  207. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W18.yaml +40 -0
  208. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W48.yaml +40 -0
  209. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-B.yaml +41 -0
  210. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-T.yaml +40 -0
  211. paddlex/configs/modules/semantic_segmentation/SeaFormer_base.yaml +40 -0
  212. paddlex/configs/modules/semantic_segmentation/SeaFormer_large.yaml +40 -0
  213. paddlex/configs/modules/semantic_segmentation/SeaFormer_small.yaml +40 -0
  214. paddlex/configs/modules/semantic_segmentation/SeaFormer_tiny.yaml +40 -0
  215. paddlex/configs/modules/semantic_segmentation/SegFormer-B0.yaml +40 -0
  216. paddlex/configs/modules/semantic_segmentation/SegFormer-B1.yaml +40 -0
  217. paddlex/configs/modules/semantic_segmentation/SegFormer-B2.yaml +40 -0
  218. paddlex/configs/modules/semantic_segmentation/SegFormer-B3.yaml +40 -0
  219. paddlex/configs/modules/semantic_segmentation/SegFormer-B4.yaml +40 -0
  220. paddlex/configs/modules/semantic_segmentation/SegFormer-B5.yaml +40 -0
  221. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-L.yaml +42 -0
  222. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-S.yaml +42 -0
  223. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-largesize-L.yaml +42 -0
  224. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wired_table_cell_det.yaml +40 -0
  225. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wireless_table_cell_det.yaml +40 -0
  226. paddlex/configs/modules/table_classification/PP-LCNet_x1_0_table_cls.yaml +41 -0
  227. paddlex/configs/modules/table_structure_recognition/SLANeXt_wired.yaml +39 -0
  228. paddlex/configs/modules/table_structure_recognition/SLANeXt_wireless.yaml +39 -0
  229. paddlex/configs/modules/table_structure_recognition/SLANet.yaml +39 -0
  230. paddlex/configs/modules/table_structure_recognition/SLANet_plus.yaml +39 -0
  231. paddlex/configs/modules/text_detection/PP-OCRv3_mobile_det.yaml +40 -0
  232. paddlex/configs/modules/text_detection/PP-OCRv3_server_det.yaml +40 -0
  233. paddlex/configs/modules/text_detection/PP-OCRv4_mobile_det.yaml +40 -0
  234. paddlex/configs/modules/text_detection/PP-OCRv4_server_det.yaml +40 -0
  235. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  236. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  237. paddlex/configs/modules/text_recognition/PP-OCRv3_mobile_rec.yaml +39 -0
  238. paddlex/configs/modules/text_recognition/PP-OCRv4_mobile_rec.yaml +39 -0
  239. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec.yaml +39 -0
  240. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec_doc.yaml +39 -0
  241. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  242. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  243. paddlex/configs/modules/text_recognition/arabic_PP-OCRv3_mobile_rec.yaml +39 -0
  244. paddlex/configs/modules/text_recognition/ch_RepSVTR_rec.yaml +39 -0
  245. paddlex/configs/modules/text_recognition/ch_SVTRv2_rec.yaml +39 -0
  246. paddlex/configs/modules/text_recognition/chinese_cht_PP-OCRv3_mobile_rec.yaml +39 -0
  247. paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv3_mobile_rec.yaml +39 -0
  248. paddlex/configs/modules/text_recognition/devanagari_PP-OCRv3_mobile_rec.yaml +39 -0
  249. paddlex/configs/modules/text_recognition/en_PP-OCRv3_mobile_rec.yaml +39 -0
  250. paddlex/configs/modules/text_recognition/en_PP-OCRv4_mobile_rec.yaml +39 -0
  251. paddlex/configs/modules/text_recognition/japan_PP-OCRv3_mobile_rec.yaml +39 -0
  252. paddlex/configs/modules/text_recognition/ka_PP-OCRv3_mobile_rec.yaml +39 -0
  253. paddlex/configs/modules/text_recognition/korean_PP-OCRv3_mobile_rec.yaml +39 -0
  254. paddlex/configs/modules/text_recognition/latin_PP-OCRv3_mobile_rec.yaml +39 -0
  255. paddlex/configs/modules/text_recognition/ta_PP-OCRv3_mobile_rec.yaml +39 -0
  256. paddlex/configs/modules/text_recognition/te_PP-OCRv3_mobile_rec.yaml +39 -0
  257. paddlex/configs/modules/textline_orientation/PP-LCNet_x0_25_textline_ori.yaml +41 -0
  258. paddlex/configs/modules/ts_anomaly_detection/AutoEncoder_ad.yaml +37 -0
  259. paddlex/configs/modules/ts_anomaly_detection/DLinear_ad.yaml +37 -0
  260. paddlex/configs/modules/ts_anomaly_detection/Nonstationary_ad.yaml +37 -0
  261. paddlex/configs/modules/ts_anomaly_detection/PatchTST_ad.yaml +37 -0
  262. paddlex/configs/modules/ts_anomaly_detection/TimesNet_ad.yaml +37 -0
  263. paddlex/configs/modules/ts_classification/TimesNet_cls.yaml +37 -0
  264. paddlex/configs/modules/ts_forecast/DLinear.yaml +38 -0
  265. paddlex/configs/modules/ts_forecast/NLinear.yaml +38 -0
  266. paddlex/configs/modules/ts_forecast/Nonstationary.yaml +38 -0
  267. paddlex/configs/modules/ts_forecast/PatchTST.yaml +38 -0
  268. paddlex/configs/modules/ts_forecast/RLinear.yaml +38 -0
  269. paddlex/configs/modules/ts_forecast/TiDE.yaml +38 -0
  270. paddlex/configs/modules/ts_forecast/TimesNet.yaml +38 -0
  271. paddlex/configs/modules/vehicle_attribute_recognition/PP-LCNet_x1_0_vehicle_attribute.yaml +41 -0
  272. paddlex/configs/modules/vehicle_detection/PP-YOLOE-L_vehicle.yaml +41 -0
  273. paddlex/configs/modules/vehicle_detection/PP-YOLOE-S_vehicle.yaml +42 -0
  274. paddlex/configs/modules/video_classification/PP-TSM-R50_8frames_uniform.yaml +42 -0
  275. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_16frames_uniform.yaml +42 -0
  276. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_8frames_uniform.yaml +42 -0
  277. paddlex/configs/modules/video_detection/YOWO.yaml +40 -0
  278. paddlex/configs/pipelines/3d_bev_detection.yaml +9 -0
  279. paddlex/configs/pipelines/OCR.yaml +45 -0
  280. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +151 -0
  281. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +237 -0
  282. paddlex/configs/pipelines/PP-ShiTuV2.yaml +18 -0
  283. paddlex/configs/pipelines/PP-StructureV3.yaml +226 -0
  284. paddlex/configs/pipelines/anomaly_detection.yaml +8 -0
  285. paddlex/configs/pipelines/doc_preprocessor.yaml +15 -0
  286. paddlex/configs/pipelines/doc_understanding.yaml +9 -0
  287. paddlex/configs/pipelines/face_recognition.yaml +18 -0
  288. paddlex/configs/pipelines/formula_recognition.yaml +39 -0
  289. paddlex/configs/pipelines/human_keypoint_detection.yaml +17 -0
  290. paddlex/configs/pipelines/image_classification.yaml +10 -0
  291. paddlex/configs/pipelines/image_multilabel_classification.yaml +9 -0
  292. paddlex/configs/pipelines/instance_segmentation.yaml +10 -0
  293. paddlex/configs/pipelines/layout_parsing.yaml +102 -0
  294. paddlex/configs/pipelines/multilingual_speech_recognition.yaml +9 -0
  295. paddlex/configs/pipelines/object_detection.yaml +10 -0
  296. paddlex/configs/pipelines/open_vocabulary_detection.yaml +12 -0
  297. paddlex/configs/pipelines/open_vocabulary_segmentation.yaml +13 -0
  298. paddlex/configs/pipelines/pedestrian_attribute_recognition.yaml +15 -0
  299. paddlex/configs/pipelines/rotated_object_detection.yaml +10 -0
  300. paddlex/configs/pipelines/seal_recognition.yaml +52 -0
  301. paddlex/configs/pipelines/semantic_segmentation.yaml +10 -0
  302. paddlex/configs/pipelines/small_object_detection.yaml +10 -0
  303. paddlex/configs/pipelines/table_recognition.yaml +57 -0
  304. paddlex/configs/pipelines/table_recognition_v2.yaml +82 -0
  305. paddlex/configs/pipelines/ts_anomaly_detection.yaml +8 -0
  306. paddlex/configs/pipelines/ts_classification.yaml +8 -0
  307. paddlex/configs/pipelines/ts_forecast.yaml +8 -0
  308. paddlex/configs/pipelines/vehicle_attribute_recognition.yaml +15 -0
  309. paddlex/configs/pipelines/video_classification.yaml +9 -0
  310. paddlex/configs/pipelines/video_detection.yaml +10 -0
  311. paddlex/constants.py +17 -0
  312. paddlex/engine.py +56 -0
  313. paddlex/hpip_links.html +31 -0
  314. paddlex/inference/__init__.py +19 -0
  315. paddlex/inference/common/__init__.py +13 -0
  316. paddlex/inference/common/batch_sampler/__init__.py +21 -0
  317. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +83 -0
  318. paddlex/inference/common/batch_sampler/base_batch_sampler.py +94 -0
  319. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +144 -0
  320. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +87 -0
  321. paddlex/inference/common/batch_sampler/image_batch_sampler.py +121 -0
  322. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +109 -0
  323. paddlex/inference/common/batch_sampler/video_batch_sampler.py +74 -0
  324. paddlex/inference/common/reader/__init__.py +19 -0
  325. paddlex/inference/common/reader/audio_reader.py +46 -0
  326. paddlex/inference/common/reader/det_3d_reader.py +241 -0
  327. paddlex/inference/common/reader/image_reader.py +73 -0
  328. paddlex/inference/common/reader/ts_reader.py +46 -0
  329. paddlex/inference/common/reader/video_reader.py +42 -0
  330. paddlex/inference/common/result/__init__.py +29 -0
  331. paddlex/inference/common/result/base_cv_result.py +41 -0
  332. paddlex/inference/common/result/base_result.py +72 -0
  333. paddlex/inference/common/result/base_ts_result.py +41 -0
  334. paddlex/inference/common/result/base_video_result.py +36 -0
  335. paddlex/inference/common/result/mixin.py +709 -0
  336. paddlex/inference/models/__init__.py +86 -0
  337. paddlex/inference/models/anomaly_detection/__init__.py +15 -0
  338. paddlex/inference/models/anomaly_detection/predictor.py +135 -0
  339. paddlex/inference/models/anomaly_detection/processors.py +53 -0
  340. paddlex/inference/models/anomaly_detection/result.py +71 -0
  341. paddlex/inference/models/base/__init__.py +15 -0
  342. paddlex/inference/models/base/predictor/__init__.py +15 -0
  343. paddlex/inference/models/base/predictor/base_predictor.py +414 -0
  344. paddlex/inference/models/common/__init__.py +26 -0
  345. paddlex/inference/models/common/static_infer.py +801 -0
  346. paddlex/inference/models/common/tokenizer/__init__.py +21 -0
  347. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +655 -0
  348. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
  349. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +453 -0
  350. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  351. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +438 -0
  352. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  353. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +2149 -0
  354. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3720 -0
  355. paddlex/inference/models/common/tokenizer/utils.py +66 -0
  356. paddlex/inference/models/common/tokenizer/vocab.py +647 -0
  357. paddlex/inference/models/common/ts/__init__.py +15 -0
  358. paddlex/inference/models/common/ts/funcs.py +540 -0
  359. paddlex/inference/models/common/ts/processors.py +322 -0
  360. paddlex/inference/models/common/vision/__init__.py +23 -0
  361. paddlex/inference/models/common/vision/funcs.py +98 -0
  362. paddlex/inference/models/common/vision/processors.py +285 -0
  363. paddlex/inference/models/common/vlm/__init__.py +13 -0
  364. paddlex/inference/models/common/vlm/activations.py +189 -0
  365. paddlex/inference/models/common/vlm/bert_padding.py +127 -0
  366. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  367. paddlex/inference/models/common/vlm/distributed.py +229 -0
  368. paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
  369. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  370. paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
  371. paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
  372. paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
  373. paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
  374. paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
  375. paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
  376. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
  377. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
  378. paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
  379. paddlex/inference/models/common/vlm/transformers/model_utils.py +2014 -0
  380. paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
  381. paddlex/inference/models/common/vlm/utils.py +109 -0
  382. paddlex/inference/models/doc_vlm/__init__.py +15 -0
  383. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  384. paddlex/inference/models/doc_vlm/modeling/__init__.py +17 -0
  385. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  386. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  387. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2495 -0
  388. paddlex/inference/models/doc_vlm/predictor.py +253 -0
  389. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  390. paddlex/inference/models/doc_vlm/processors/__init__.py +17 -0
  391. paddlex/inference/models/doc_vlm/processors/common.py +561 -0
  392. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  393. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +543 -0
  394. paddlex/inference/models/doc_vlm/result.py +21 -0
  395. paddlex/inference/models/face_feature/__init__.py +15 -0
  396. paddlex/inference/models/face_feature/predictor.py +66 -0
  397. paddlex/inference/models/formula_recognition/__init__.py +15 -0
  398. paddlex/inference/models/formula_recognition/predictor.py +193 -0
  399. paddlex/inference/models/formula_recognition/processors.py +1015 -0
  400. paddlex/inference/models/formula_recognition/result.py +411 -0
  401. paddlex/inference/models/image_classification/__init__.py +15 -0
  402. paddlex/inference/models/image_classification/predictor.py +172 -0
  403. paddlex/inference/models/image_classification/processors.py +89 -0
  404. paddlex/inference/models/image_classification/result.py +93 -0
  405. paddlex/inference/models/image_feature/__init__.py +15 -0
  406. paddlex/inference/models/image_feature/predictor.py +146 -0
  407. paddlex/inference/models/image_feature/processors.py +31 -0
  408. paddlex/inference/models/image_feature/result.py +32 -0
  409. paddlex/inference/models/image_multilabel_classification/__init__.py +15 -0
  410. paddlex/inference/models/image_multilabel_classification/predictor.py +95 -0
  411. paddlex/inference/models/image_multilabel_classification/processors.py +89 -0
  412. paddlex/inference/models/image_multilabel_classification/result.py +96 -0
  413. paddlex/inference/models/image_unwarping/__init__.py +15 -0
  414. paddlex/inference/models/image_unwarping/predictor.py +97 -0
  415. paddlex/inference/models/image_unwarping/processors.py +92 -0
  416. paddlex/inference/models/image_unwarping/result.py +47 -0
  417. paddlex/inference/models/instance_segmentation/__init__.py +15 -0
  418. paddlex/inference/models/instance_segmentation/predictor.py +202 -0
  419. paddlex/inference/models/instance_segmentation/processors.py +102 -0
  420. paddlex/inference/models/instance_segmentation/result.py +162 -0
  421. paddlex/inference/models/keypoint_detection/__init__.py +15 -0
  422. paddlex/inference/models/keypoint_detection/predictor.py +190 -0
  423. paddlex/inference/models/keypoint_detection/processors.py +367 -0
  424. paddlex/inference/models/keypoint_detection/result.py +197 -0
  425. paddlex/inference/models/m_3d_bev_detection/__init__.py +15 -0
  426. paddlex/inference/models/m_3d_bev_detection/predictor.py +303 -0
  427. paddlex/inference/models/m_3d_bev_detection/processors.py +990 -0
  428. paddlex/inference/models/m_3d_bev_detection/result.py +68 -0
  429. paddlex/inference/models/m_3d_bev_detection/visualizer_3d.py +169 -0
  430. paddlex/inference/models/multilingual_speech_recognition/__init__.py +15 -0
  431. paddlex/inference/models/multilingual_speech_recognition/predictor.py +137 -0
  432. paddlex/inference/models/multilingual_speech_recognition/processors.py +1933 -0
  433. paddlex/inference/models/multilingual_speech_recognition/result.py +21 -0
  434. paddlex/inference/models/object_detection/__init__.py +15 -0
  435. paddlex/inference/models/object_detection/predictor.py +344 -0
  436. paddlex/inference/models/object_detection/processors.py +885 -0
  437. paddlex/inference/models/object_detection/result.py +114 -0
  438. paddlex/inference/models/object_detection/utils.py +70 -0
  439. paddlex/inference/models/open_vocabulary_detection/__init__.py +15 -0
  440. paddlex/inference/models/open_vocabulary_detection/predictor.py +172 -0
  441. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +16 -0
  442. paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
  443. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +496 -0
  444. paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
  445. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +15 -0
  446. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +113 -0
  447. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +15 -0
  448. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +249 -0
  449. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +15 -0
  450. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +149 -0
  451. paddlex/inference/models/semantic_segmentation/__init__.py +15 -0
  452. paddlex/inference/models/semantic_segmentation/predictor.py +158 -0
  453. paddlex/inference/models/semantic_segmentation/processors.py +117 -0
  454. paddlex/inference/models/semantic_segmentation/result.py +73 -0
  455. paddlex/inference/models/table_structure_recognition/__init__.py +15 -0
  456. paddlex/inference/models/table_structure_recognition/predictor.py +161 -0
  457. paddlex/inference/models/table_structure_recognition/processors.py +229 -0
  458. paddlex/inference/models/table_structure_recognition/result.py +63 -0
  459. paddlex/inference/models/text_detection/__init__.py +15 -0
  460. paddlex/inference/models/text_detection/predictor.py +191 -0
  461. paddlex/inference/models/text_detection/processors.py +538 -0
  462. paddlex/inference/models/text_detection/result.py +46 -0
  463. paddlex/inference/models/text_recognition/__init__.py +15 -0
  464. paddlex/inference/models/text_recognition/predictor.py +98 -0
  465. paddlex/inference/models/text_recognition/processors.py +245 -0
  466. paddlex/inference/models/text_recognition/result.py +76 -0
  467. paddlex/inference/models/ts_anomaly_detection/__init__.py +15 -0
  468. paddlex/inference/models/ts_anomaly_detection/predictor.py +141 -0
  469. paddlex/inference/models/ts_anomaly_detection/processors.py +98 -0
  470. paddlex/inference/models/ts_anomaly_detection/result.py +83 -0
  471. paddlex/inference/models/ts_classification/__init__.py +15 -0
  472. paddlex/inference/models/ts_classification/predictor.py +122 -0
  473. paddlex/inference/models/ts_classification/processors.py +122 -0
  474. paddlex/inference/models/ts_classification/result.py +87 -0
  475. paddlex/inference/models/ts_forecasting/__init__.py +15 -0
  476. paddlex/inference/models/ts_forecasting/predictor.py +154 -0
  477. paddlex/inference/models/ts_forecasting/processors.py +158 -0
  478. paddlex/inference/models/ts_forecasting/result.py +96 -0
  479. paddlex/inference/models/video_classification/__init__.py +15 -0
  480. paddlex/inference/models/video_classification/predictor.py +141 -0
  481. paddlex/inference/models/video_classification/processors.py +409 -0
  482. paddlex/inference/models/video_classification/result.py +96 -0
  483. paddlex/inference/models/video_detection/__init__.py +15 -0
  484. paddlex/inference/models/video_detection/predictor.py +129 -0
  485. paddlex/inference/models/video_detection/processors.py +463 -0
  486. paddlex/inference/models/video_detection/result.py +109 -0
  487. paddlex/inference/pipelines/__init__.py +239 -0
  488. paddlex/inference/pipelines/_parallel.py +172 -0
  489. paddlex/inference/pipelines/anomaly_detection/__init__.py +15 -0
  490. paddlex/inference/pipelines/anomaly_detection/pipeline.py +82 -0
  491. paddlex/inference/pipelines/attribute_recognition/__init__.py +15 -0
  492. paddlex/inference/pipelines/attribute_recognition/pipeline.py +120 -0
  493. paddlex/inference/pipelines/attribute_recognition/result.py +102 -0
  494. paddlex/inference/pipelines/base.py +156 -0
  495. paddlex/inference/pipelines/components/__init__.py +29 -0
  496. paddlex/inference/pipelines/components/chat_server/__init__.py +16 -0
  497. paddlex/inference/pipelines/components/chat_server/base.py +39 -0
  498. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +236 -0
  499. paddlex/inference/pipelines/components/common/__init__.py +19 -0
  500. paddlex/inference/pipelines/components/common/base_operator.py +37 -0
  501. paddlex/inference/pipelines/components/common/base_result.py +66 -0
  502. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +45 -0
  503. paddlex/inference/pipelines/components/common/crop_image_regions.py +556 -0
  504. paddlex/inference/pipelines/components/common/seal_det_warp.py +972 -0
  505. paddlex/inference/pipelines/components/common/sort_boxes.py +85 -0
  506. paddlex/inference/pipelines/components/common/warp_image.py +50 -0
  507. paddlex/inference/pipelines/components/faisser.py +357 -0
  508. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +16 -0
  509. paddlex/inference/pipelines/components/prompt_engineering/base.py +35 -0
  510. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +128 -0
  511. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +148 -0
  512. paddlex/inference/pipelines/components/retriever/__init__.py +16 -0
  513. paddlex/inference/pipelines/components/retriever/base.py +228 -0
  514. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +70 -0
  515. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +166 -0
  516. paddlex/inference/pipelines/components/utils/__init__.py +13 -0
  517. paddlex/inference/pipelines/components/utils/mixin.py +206 -0
  518. paddlex/inference/pipelines/doc_preprocessor/__init__.py +15 -0
  519. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +209 -0
  520. paddlex/inference/pipelines/doc_preprocessor/result.py +98 -0
  521. paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
  522. paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
  523. paddlex/inference/pipelines/face_recognition/__init__.py +15 -0
  524. paddlex/inference/pipelines/face_recognition/pipeline.py +63 -0
  525. paddlex/inference/pipelines/face_recognition/result.py +44 -0
  526. paddlex/inference/pipelines/formula_recognition/__init__.py +15 -0
  527. paddlex/inference/pipelines/formula_recognition/pipeline.py +347 -0
  528. paddlex/inference/pipelines/formula_recognition/result.py +282 -0
  529. paddlex/inference/pipelines/image_classification/__init__.py +15 -0
  530. paddlex/inference/pipelines/image_classification/pipeline.py +90 -0
  531. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +15 -0
  532. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +97 -0
  533. paddlex/inference/pipelines/instance_segmentation/__init__.py +15 -0
  534. paddlex/inference/pipelines/instance_segmentation/pipeline.py +91 -0
  535. paddlex/inference/pipelines/keypoint_detection/__init__.py +15 -0
  536. paddlex/inference/pipelines/keypoint_detection/pipeline.py +158 -0
  537. paddlex/inference/pipelines/layout_parsing/__init__.py +16 -0
  538. paddlex/inference/pipelines/layout_parsing/pipeline.py +568 -0
  539. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +1382 -0
  540. paddlex/inference/pipelines/layout_parsing/result.py +191 -0
  541. paddlex/inference/pipelines/layout_parsing/result_v2.py +745 -0
  542. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  543. paddlex/inference/pipelines/layout_parsing/utils.py +951 -0
  544. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  545. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1143 -0
  546. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +562 -0
  547. paddlex/inference/pipelines/m_3d_bev_detection/__init__.py +15 -0
  548. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +74 -0
  549. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +15 -0
  550. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +78 -0
  551. paddlex/inference/pipelines/object_detection/__init__.py +15 -0
  552. paddlex/inference/pipelines/object_detection/pipeline.py +115 -0
  553. paddlex/inference/pipelines/ocr/__init__.py +15 -0
  554. paddlex/inference/pipelines/ocr/pipeline.py +463 -0
  555. paddlex/inference/pipelines/ocr/result.py +255 -0
  556. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +15 -0
  557. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +86 -0
  558. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +15 -0
  559. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +100 -0
  560. paddlex/inference/pipelines/pp_chatocr/__init__.py +16 -0
  561. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +111 -0
  562. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +781 -0
  563. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +992 -0
  564. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +15 -0
  565. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +156 -0
  566. paddlex/inference/pipelines/pp_shitu_v2/result.py +126 -0
  567. paddlex/inference/pipelines/rotated_object_detection/__init__.py +15 -0
  568. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +95 -0
  569. paddlex/inference/pipelines/seal_recognition/__init__.py +15 -0
  570. paddlex/inference/pipelines/seal_recognition/pipeline.py +335 -0
  571. paddlex/inference/pipelines/seal_recognition/result.py +89 -0
  572. paddlex/inference/pipelines/semantic_segmentation/__init__.py +15 -0
  573. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +95 -0
  574. paddlex/inference/pipelines/small_object_detection/__init__.py +15 -0
  575. paddlex/inference/pipelines/small_object_detection/pipeline.py +95 -0
  576. paddlex/inference/pipelines/table_recognition/__init__.py +16 -0
  577. paddlex/inference/pipelines/table_recognition/pipeline.py +486 -0
  578. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +1395 -0
  579. paddlex/inference/pipelines/table_recognition/result.py +218 -0
  580. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +366 -0
  581. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +488 -0
  582. paddlex/inference/pipelines/table_recognition/utils.py +44 -0
  583. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +15 -0
  584. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +72 -0
  585. paddlex/inference/pipelines/ts_classification/__init__.py +15 -0
  586. paddlex/inference/pipelines/ts_classification/pipeline.py +72 -0
  587. paddlex/inference/pipelines/ts_forecasting/__init__.py +15 -0
  588. paddlex/inference/pipelines/ts_forecasting/pipeline.py +72 -0
  589. paddlex/inference/pipelines/video_classification/__init__.py +15 -0
  590. paddlex/inference/pipelines/video_classification/pipeline.py +79 -0
  591. paddlex/inference/pipelines/video_detection/__init__.py +15 -0
  592. paddlex/inference/pipelines/video_detection/pipeline.py +86 -0
  593. paddlex/inference/serving/__init__.py +17 -0
  594. paddlex/inference/serving/basic_serving/__init__.py +18 -0
  595. paddlex/inference/serving/basic_serving/_app.py +221 -0
  596. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +44 -0
  597. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +13 -0
  598. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +104 -0
  599. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +36 -0
  600. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +95 -0
  601. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +67 -0
  602. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +100 -0
  603. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
  604. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +226 -0
  605. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +100 -0
  606. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +81 -0
  607. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +69 -0
  608. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +73 -0
  609. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +87 -0
  610. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +117 -0
  611. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +79 -0
  612. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +92 -0
  613. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +77 -0
  614. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +102 -0
  615. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +81 -0
  616. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +91 -0
  617. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +84 -0
  618. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +193 -0
  619. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +223 -0
  620. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +221 -0
  621. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +143 -0
  622. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +81 -0
  623. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +106 -0
  624. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +67 -0
  625. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +72 -0
  626. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +108 -0
  627. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +113 -0
  628. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +65 -0
  629. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +64 -0
  630. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +65 -0
  631. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +84 -0
  632. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +76 -0
  633. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +92 -0
  634. paddlex/inference/serving/basic_serving/_server.py +40 -0
  635. paddlex/inference/serving/infra/__init__.py +13 -0
  636. paddlex/inference/serving/infra/config.py +36 -0
  637. paddlex/inference/serving/infra/models.py +79 -0
  638. paddlex/inference/serving/infra/storage.py +180 -0
  639. paddlex/inference/serving/infra/utils.py +285 -0
  640. paddlex/inference/serving/schemas/__init__.py +13 -0
  641. paddlex/inference/serving/schemas/anomaly_detection.py +39 -0
  642. paddlex/inference/serving/schemas/doc_preprocessor.py +54 -0
  643. paddlex/inference/serving/schemas/doc_understanding.py +78 -0
  644. paddlex/inference/serving/schemas/face_recognition.py +124 -0
  645. paddlex/inference/serving/schemas/formula_recognition.py +56 -0
  646. paddlex/inference/serving/schemas/human_keypoint_detection.py +55 -0
  647. paddlex/inference/serving/schemas/image_classification.py +45 -0
  648. paddlex/inference/serving/schemas/image_multilabel_classification.py +47 -0
  649. paddlex/inference/serving/schemas/instance_segmentation.py +53 -0
  650. paddlex/inference/serving/schemas/layout_parsing.py +71 -0
  651. paddlex/inference/serving/schemas/m_3d_bev_detection.py +48 -0
  652. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +57 -0
  653. paddlex/inference/serving/schemas/object_detection.py +52 -0
  654. paddlex/inference/serving/schemas/ocr.py +60 -0
  655. paddlex/inference/serving/schemas/open_vocabulary_detection.py +52 -0
  656. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +52 -0
  657. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +61 -0
  658. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +133 -0
  659. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +150 -0
  660. paddlex/inference/serving/schemas/pp_shituv2.py +124 -0
  661. paddlex/inference/serving/schemas/pp_structurev3.py +88 -0
  662. paddlex/inference/serving/schemas/rotated_object_detection.py +52 -0
  663. paddlex/inference/serving/schemas/seal_recognition.py +62 -0
  664. paddlex/inference/serving/schemas/semantic_segmentation.py +45 -0
  665. paddlex/inference/serving/schemas/shared/__init__.py +13 -0
  666. paddlex/inference/serving/schemas/shared/classification.py +23 -0
  667. paddlex/inference/serving/schemas/shared/image_segmentation.py +28 -0
  668. paddlex/inference/serving/schemas/shared/object_detection.py +24 -0
  669. paddlex/inference/serving/schemas/shared/ocr.py +25 -0
  670. paddlex/inference/serving/schemas/small_object_detection.py +52 -0
  671. paddlex/inference/serving/schemas/table_recognition.py +64 -0
  672. paddlex/inference/serving/schemas/table_recognition_v2.py +69 -0
  673. paddlex/inference/serving/schemas/ts_anomaly_detection.py +37 -0
  674. paddlex/inference/serving/schemas/ts_classification.py +38 -0
  675. paddlex/inference/serving/schemas/ts_forecast.py +37 -0
  676. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +61 -0
  677. paddlex/inference/serving/schemas/video_classification.py +44 -0
  678. paddlex/inference/serving/schemas/video_detection.py +56 -0
  679. paddlex/inference/utils/__init__.py +13 -0
  680. paddlex/inference/utils/benchmark.py +379 -0
  681. paddlex/inference/utils/color_map.py +123 -0
  682. paddlex/inference/utils/get_pipeline_path.py +27 -0
  683. paddlex/inference/utils/hpi.py +254 -0
  684. paddlex/inference/utils/hpi_model_info_collection.json +2331 -0
  685. paddlex/inference/utils/io/__init__.py +36 -0
  686. paddlex/inference/utils/io/readers.py +504 -0
  687. paddlex/inference/utils/io/style.py +381 -0
  688. paddlex/inference/utils/io/tablepyxl.py +157 -0
  689. paddlex/inference/utils/io/writers.py +458 -0
  690. paddlex/inference/utils/model_paths.py +48 -0
  691. paddlex/inference/utils/new_ir_blocklist.py +27 -0
  692. paddlex/inference/utils/official_models.py +367 -0
  693. paddlex/inference/utils/pp_option.py +339 -0
  694. paddlex/inference/utils/trt_blocklist.py +43 -0
  695. paddlex/inference/utils/trt_config.py +420 -0
  696. paddlex/model.py +131 -0
  697. paddlex/modules/__init__.py +115 -0
  698. paddlex/modules/anomaly_detection/__init__.py +18 -0
  699. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +94 -0
  700. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  701. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  702. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  703. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +233 -0
  704. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +87 -0
  705. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  706. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +76 -0
  707. paddlex/modules/anomaly_detection/evaluator.py +58 -0
  708. paddlex/modules/anomaly_detection/exportor.py +22 -0
  709. paddlex/modules/anomaly_detection/model_list.py +16 -0
  710. paddlex/modules/anomaly_detection/trainer.py +70 -0
  711. paddlex/modules/base/__init__.py +18 -0
  712. paddlex/modules/base/build_model.py +33 -0
  713. paddlex/modules/base/dataset_checker/__init__.py +16 -0
  714. paddlex/modules/base/dataset_checker/dataset_checker.py +169 -0
  715. paddlex/modules/base/dataset_checker/utils.py +108 -0
  716. paddlex/modules/base/evaluator.py +170 -0
  717. paddlex/modules/base/exportor.py +145 -0
  718. paddlex/modules/base/trainer.py +144 -0
  719. paddlex/modules/base/utils/__init__.py +13 -0
  720. paddlex/modules/base/utils/cinn_setting.py +89 -0
  721. paddlex/modules/base/utils/coco_eval.py +94 -0
  722. paddlex/modules/base/utils/topk_eval.py +118 -0
  723. paddlex/modules/doc_vlm/__init__.py +18 -0
  724. paddlex/modules/doc_vlm/dataset_checker.py +29 -0
  725. paddlex/modules/doc_vlm/evaluator.py +29 -0
  726. paddlex/modules/doc_vlm/exportor.py +29 -0
  727. paddlex/modules/doc_vlm/model_list.py +16 -0
  728. paddlex/modules/doc_vlm/trainer.py +41 -0
  729. paddlex/modules/face_recognition/__init__.py +18 -0
  730. paddlex/modules/face_recognition/dataset_checker/__init__.py +71 -0
  731. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +16 -0
  732. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +172 -0
  733. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  734. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +153 -0
  735. paddlex/modules/face_recognition/evaluator.py +52 -0
  736. paddlex/modules/face_recognition/exportor.py +22 -0
  737. paddlex/modules/face_recognition/model_list.py +15 -0
  738. paddlex/modules/face_recognition/trainer.py +75 -0
  739. paddlex/modules/formula_recognition/__init__.py +18 -0
  740. paddlex/modules/formula_recognition/dataset_checker/__init__.py +113 -0
  741. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  742. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +158 -0
  743. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +76 -0
  744. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +95 -0
  745. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  746. paddlex/modules/formula_recognition/evaluator.py +80 -0
  747. paddlex/modules/formula_recognition/exportor.py +22 -0
  748. paddlex/modules/formula_recognition/model_list.py +23 -0
  749. paddlex/modules/formula_recognition/trainer.py +123 -0
  750. paddlex/modules/general_recognition/__init__.py +18 -0
  751. paddlex/modules/general_recognition/dataset_checker/__init__.py +107 -0
  752. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  753. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +96 -0
  754. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +99 -0
  755. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +100 -0
  756. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +82 -0
  757. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  758. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +147 -0
  759. paddlex/modules/general_recognition/evaluator.py +31 -0
  760. paddlex/modules/general_recognition/exportor.py +22 -0
  761. paddlex/modules/general_recognition/model_list.py +19 -0
  762. paddlex/modules/general_recognition/trainer.py +52 -0
  763. paddlex/modules/image_classification/__init__.py +18 -0
  764. paddlex/modules/image_classification/dataset_checker/__init__.py +104 -0
  765. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +19 -0
  766. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +92 -0
  767. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +132 -0
  768. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +51 -0
  769. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  770. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  771. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +153 -0
  772. paddlex/modules/image_classification/evaluator.py +43 -0
  773. paddlex/modules/image_classification/exportor.py +22 -0
  774. paddlex/modules/image_classification/model_list.py +99 -0
  775. paddlex/modules/image_classification/trainer.py +82 -0
  776. paddlex/modules/image_unwarping/__init__.py +13 -0
  777. paddlex/modules/image_unwarping/model_list.py +17 -0
  778. paddlex/modules/instance_segmentation/__init__.py +18 -0
  779. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +107 -0
  780. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  781. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  782. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +95 -0
  783. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +241 -0
  784. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +122 -0
  785. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  786. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +223 -0
  787. paddlex/modules/instance_segmentation/evaluator.py +32 -0
  788. paddlex/modules/instance_segmentation/exportor.py +22 -0
  789. paddlex/modules/instance_segmentation/model_list.py +33 -0
  790. paddlex/modules/instance_segmentation/trainer.py +31 -0
  791. paddlex/modules/keypoint_detection/__init__.py +18 -0
  792. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +56 -0
  793. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +15 -0
  794. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  795. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  796. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +124 -0
  797. paddlex/modules/keypoint_detection/evaluator.py +41 -0
  798. paddlex/modules/keypoint_detection/exportor.py +22 -0
  799. paddlex/modules/keypoint_detection/model_list.py +16 -0
  800. paddlex/modules/keypoint_detection/trainer.py +39 -0
  801. paddlex/modules/m_3d_bev_detection/__init__.py +18 -0
  802. paddlex/modules/m_3d_bev_detection/dataset_checker/__init__.py +95 -0
  803. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/__init__.py +17 -0
  804. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/analyse_dataset.py +106 -0
  805. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/check_dataset.py +101 -0
  806. paddlex/modules/m_3d_bev_detection/evaluator.py +46 -0
  807. paddlex/modules/m_3d_bev_detection/exportor.py +22 -0
  808. paddlex/modules/m_3d_bev_detection/model_list.py +18 -0
  809. paddlex/modules/m_3d_bev_detection/trainer.py +68 -0
  810. paddlex/modules/multilabel_classification/__init__.py +18 -0
  811. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +106 -0
  812. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +19 -0
  813. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +94 -0
  814. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +132 -0
  815. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +120 -0
  816. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  817. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  818. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +149 -0
  819. paddlex/modules/multilabel_classification/evaluator.py +43 -0
  820. paddlex/modules/multilabel_classification/exportor.py +22 -0
  821. paddlex/modules/multilabel_classification/model_list.py +24 -0
  822. paddlex/modules/multilabel_classification/trainer.py +85 -0
  823. paddlex/modules/multilingual_speech_recognition/__init__.py +18 -0
  824. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +27 -0
  825. paddlex/modules/multilingual_speech_recognition/evaluator.py +27 -0
  826. paddlex/modules/multilingual_speech_recognition/exportor.py +27 -0
  827. paddlex/modules/multilingual_speech_recognition/model_list.py +22 -0
  828. paddlex/modules/multilingual_speech_recognition/trainer.py +42 -0
  829. paddlex/modules/object_detection/__init__.py +18 -0
  830. paddlex/modules/object_detection/dataset_checker/__init__.py +106 -0
  831. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +19 -0
  832. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  833. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  834. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +438 -0
  835. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +123 -0
  836. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  837. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +193 -0
  838. paddlex/modules/object_detection/evaluator.py +57 -0
  839. paddlex/modules/object_detection/exportor.py +22 -0
  840. paddlex/modules/object_detection/model_list.py +86 -0
  841. paddlex/modules/object_detection/trainer.py +98 -0
  842. paddlex/modules/open_vocabulary_detection/__init__.py +18 -0
  843. paddlex/modules/open_vocabulary_detection/dataset_checker.py +29 -0
  844. paddlex/modules/open_vocabulary_detection/evaluator.py +29 -0
  845. paddlex/modules/open_vocabulary_detection/exportor.py +29 -0
  846. paddlex/modules/open_vocabulary_detection/model_list.py +16 -0
  847. paddlex/modules/open_vocabulary_detection/trainer.py +44 -0
  848. paddlex/modules/open_vocabulary_segmentation/__init__.py +18 -0
  849. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +29 -0
  850. paddlex/modules/open_vocabulary_segmentation/evaluator.py +29 -0
  851. paddlex/modules/open_vocabulary_segmentation/exportor.py +29 -0
  852. paddlex/modules/open_vocabulary_segmentation/model_list.py +19 -0
  853. paddlex/modules/open_vocabulary_segmentation/trainer.py +44 -0
  854. paddlex/modules/semantic_segmentation/__init__.py +18 -0
  855. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +109 -0
  856. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  857. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +76 -0
  858. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +80 -0
  859. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +165 -0
  860. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +87 -0
  861. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  862. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +75 -0
  863. paddlex/modules/semantic_segmentation/evaluator.py +58 -0
  864. paddlex/modules/semantic_segmentation/exportor.py +31 -0
  865. paddlex/modules/semantic_segmentation/model_list.py +37 -0
  866. paddlex/modules/semantic_segmentation/trainer.py +72 -0
  867. paddlex/modules/table_recognition/__init__.py +18 -0
  868. paddlex/modules/table_recognition/dataset_checker/__init__.py +98 -0
  869. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +18 -0
  870. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +59 -0
  871. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +87 -0
  872. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  873. paddlex/modules/table_recognition/evaluator.py +43 -0
  874. paddlex/modules/table_recognition/exportor.py +22 -0
  875. paddlex/modules/table_recognition/model_list.py +21 -0
  876. paddlex/modules/table_recognition/trainer.py +67 -0
  877. paddlex/modules/text_detection/__init__.py +18 -0
  878. paddlex/modules/text_detection/dataset_checker/__init__.py +107 -0
  879. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +18 -0
  880. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +220 -0
  881. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +106 -0
  882. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +140 -0
  883. paddlex/modules/text_detection/evaluator.py +41 -0
  884. paddlex/modules/text_detection/exportor.py +22 -0
  885. paddlex/modules/text_detection/model_list.py +26 -0
  886. paddlex/modules/text_detection/trainer.py +65 -0
  887. paddlex/modules/text_recognition/__init__.py +18 -0
  888. paddlex/modules/text_recognition/dataset_checker/__init__.py +125 -0
  889. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  890. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +162 -0
  891. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +104 -0
  892. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +95 -0
  893. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  894. paddlex/modules/text_recognition/evaluator.py +64 -0
  895. paddlex/modules/text_recognition/exportor.py +22 -0
  896. paddlex/modules/text_recognition/model_list.py +36 -0
  897. paddlex/modules/text_recognition/trainer.py +105 -0
  898. paddlex/modules/ts_anomaly_detection/__init__.py +19 -0
  899. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +111 -0
  900. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  901. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +19 -0
  902. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +64 -0
  903. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +74 -0
  904. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +63 -0
  905. paddlex/modules/ts_anomaly_detection/evaluator.py +67 -0
  906. paddlex/modules/ts_anomaly_detection/exportor.py +44 -0
  907. paddlex/modules/ts_anomaly_detection/model_list.py +22 -0
  908. paddlex/modules/ts_anomaly_detection/trainer.py +113 -0
  909. paddlex/modules/ts_classification/__init__.py +19 -0
  910. paddlex/modules/ts_classification/dataset_checker/__init__.py +111 -0
  911. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +19 -0
  912. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +77 -0
  913. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +64 -0
  914. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +74 -0
  915. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +88 -0
  916. paddlex/modules/ts_classification/evaluator.py +66 -0
  917. paddlex/modules/ts_classification/exportor.py +44 -0
  918. paddlex/modules/ts_classification/model_list.py +18 -0
  919. paddlex/modules/ts_classification/trainer.py +108 -0
  920. paddlex/modules/ts_forecast/__init__.py +19 -0
  921. paddlex/modules/ts_forecast/dataset_checker/__init__.py +111 -0
  922. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +19 -0
  923. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +19 -0
  924. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +64 -0
  925. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +73 -0
  926. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +63 -0
  927. paddlex/modules/ts_forecast/evaluator.py +66 -0
  928. paddlex/modules/ts_forecast/exportor.py +44 -0
  929. paddlex/modules/ts_forecast/model_list.py +24 -0
  930. paddlex/modules/ts_forecast/trainer.py +108 -0
  931. paddlex/modules/video_classification/__init__.py +18 -0
  932. paddlex/modules/video_classification/dataset_checker/__init__.py +93 -0
  933. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +18 -0
  934. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +93 -0
  935. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +120 -0
  936. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +82 -0
  937. paddlex/modules/video_classification/evaluator.py +44 -0
  938. paddlex/modules/video_classification/exportor.py +22 -0
  939. paddlex/modules/video_classification/model_list.py +19 -0
  940. paddlex/modules/video_classification/trainer.py +88 -0
  941. paddlex/modules/video_detection/__init__.py +18 -0
  942. paddlex/modules/video_detection/dataset_checker/__init__.py +86 -0
  943. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +17 -0
  944. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +100 -0
  945. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +132 -0
  946. paddlex/modules/video_detection/evaluator.py +42 -0
  947. paddlex/modules/video_detection/exportor.py +22 -0
  948. paddlex/modules/video_detection/model_list.py +15 -0
  949. paddlex/modules/video_detection/trainer.py +82 -0
  950. paddlex/ops/__init__.py +152 -0
  951. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +266 -0
  952. paddlex/ops/iou3d_nms/iou3d_cpu.h +28 -0
  953. paddlex/ops/iou3d_nms/iou3d_nms.cpp +206 -0
  954. paddlex/ops/iou3d_nms/iou3d_nms.h +35 -0
  955. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +114 -0
  956. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +484 -0
  957. paddlex/ops/setup.py +37 -0
  958. paddlex/ops/voxel/voxelize_op.cc +194 -0
  959. paddlex/ops/voxel/voxelize_op.cu +346 -0
  960. paddlex/paddlex_cli.py +476 -0
  961. paddlex/repo_apis/Paddle3D_api/__init__.py +17 -0
  962. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +18 -0
  963. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +118 -0
  964. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +238 -0
  965. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +55 -0
  966. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +104 -0
  967. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +145 -0
  968. paddlex/repo_apis/PaddleClas_api/__init__.py +17 -0
  969. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +19 -0
  970. paddlex/repo_apis/PaddleClas_api/cls/config.py +595 -0
  971. paddlex/repo_apis/PaddleClas_api/cls/model.py +355 -0
  972. paddlex/repo_apis/PaddleClas_api/cls/register.py +907 -0
  973. paddlex/repo_apis/PaddleClas_api/cls/runner.py +218 -0
  974. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +18 -0
  975. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +141 -0
  976. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +20 -0
  977. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +68 -0
  978. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +50 -0
  979. paddlex/repo_apis/PaddleDetection_api/__init__.py +17 -0
  980. paddlex/repo_apis/PaddleDetection_api/config_helper.py +280 -0
  981. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +18 -0
  982. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +457 -0
  983. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +403 -0
  984. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +262 -0
  985. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +225 -0
  986. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +19 -0
  987. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +540 -0
  988. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +429 -0
  989. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +245 -0
  990. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +1135 -0
  991. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +225 -0
  992. paddlex/repo_apis/PaddleNLP_api/__init__.py +13 -0
  993. paddlex/repo_apis/PaddleOCR_api/__init__.py +22 -0
  994. paddlex/repo_apis/PaddleOCR_api/config_utils.py +53 -0
  995. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +16 -0
  996. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +571 -0
  997. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +398 -0
  998. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +99 -0
  999. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +239 -0
  1000. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +16 -0
  1001. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +64 -0
  1002. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +126 -0
  1003. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +70 -0
  1004. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +51 -0
  1005. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +16 -0
  1006. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +62 -0
  1007. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +72 -0
  1008. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +107 -0
  1009. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +53 -0
  1010. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +16 -0
  1011. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +564 -0
  1012. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +398 -0
  1013. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +216 -0
  1014. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +239 -0
  1015. paddlex/repo_apis/PaddleSeg_api/__init__.py +16 -0
  1016. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +134 -0
  1017. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +16 -0
  1018. paddlex/repo_apis/PaddleSeg_api/seg/config.py +183 -0
  1019. paddlex/repo_apis/PaddleSeg_api/seg/model.py +491 -0
  1020. paddlex/repo_apis/PaddleSeg_api/seg/register.py +272 -0
  1021. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +261 -0
  1022. paddlex/repo_apis/PaddleTS_api/__init__.py +20 -0
  1023. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +16 -0
  1024. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +88 -0
  1025. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +146 -0
  1026. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +158 -0
  1027. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +13 -0
  1028. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +244 -0
  1029. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +276 -0
  1030. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +158 -0
  1031. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +16 -0
  1032. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +72 -0
  1033. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +59 -0
  1034. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +158 -0
  1035. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +16 -0
  1036. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +136 -0
  1037. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +186 -0
  1038. paddlex/repo_apis/PaddleVideo_api/__init__.py +17 -0
  1039. paddlex/repo_apis/PaddleVideo_api/config_utils.py +51 -0
  1040. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +19 -0
  1041. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +548 -0
  1042. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +346 -0
  1043. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +70 -0
  1044. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +204 -0
  1045. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +19 -0
  1046. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +549 -0
  1047. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +298 -0
  1048. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +44 -0
  1049. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +199 -0
  1050. paddlex/repo_apis/__init__.py +13 -0
  1051. paddlex/repo_apis/base/__init__.py +22 -0
  1052. paddlex/repo_apis/base/config.py +237 -0
  1053. paddlex/repo_apis/base/model.py +563 -0
  1054. paddlex/repo_apis/base/register.py +135 -0
  1055. paddlex/repo_apis/base/runner.py +390 -0
  1056. paddlex/repo_apis/base/utils/__init__.py +13 -0
  1057. paddlex/repo_apis/base/utils/arg.py +64 -0
  1058. paddlex/repo_apis/base/utils/subprocess.py +107 -0
  1059. paddlex/repo_manager/__init__.py +17 -0
  1060. paddlex/repo_manager/core.py +253 -0
  1061. paddlex/repo_manager/meta.py +180 -0
  1062. paddlex/repo_manager/repo.py +425 -0
  1063. paddlex/repo_manager/utils.py +148 -0
  1064. paddlex/utils/__init__.py +1 -12
  1065. paddlex/utils/cache.py +146 -0
  1066. paddlex/utils/config.py +216 -0
  1067. paddlex/utils/custom_device_list.py +311 -0
  1068. paddlex/utils/deps.py +249 -0
  1069. paddlex/utils/device.py +195 -0
  1070. paddlex/utils/download.py +168 -182
  1071. paddlex/utils/env.py +31 -48
  1072. paddlex/utils/errors/__init__.py +17 -0
  1073. paddlex/utils/errors/dataset_checker.py +78 -0
  1074. paddlex/utils/errors/others.py +138 -0
  1075. paddlex/utils/file_interface.py +211 -0
  1076. paddlex/utils/flags.py +70 -0
  1077. paddlex/utils/fonts/__init__.py +97 -0
  1078. paddlex/utils/func_register.py +41 -0
  1079. paddlex/utils/install.py +87 -0
  1080. paddlex/utils/interactive_get_pipeline.py +55 -0
  1081. paddlex/utils/lazy_loader.py +68 -0
  1082. paddlex/utils/logging.py +140 -33
  1083. paddlex/utils/misc.py +201 -0
  1084. paddlex/utils/pipeline_arguments.py +719 -0
  1085. paddlex/utils/result_saver.py +58 -0
  1086. paddlex/utils/subclass_register.py +99 -0
  1087. paddlex/version.py +55 -0
  1088. paddlex-3.0.0.dist-info/METADATA +1168 -0
  1089. paddlex-3.0.0.dist-info/RECORD +1093 -0
  1090. paddlex-3.0.0.dist-info/WHEEL +5 -0
  1091. paddlex-3.0.0.dist-info/entry_points.txt +2 -0
  1092. paddlex-3.0.0.dist-info/licenses/LICENSE +169 -0
  1093. paddlex-3.0.0.dist-info/top_level.txt +1 -0
  1094. PaddleClas/__init__.py +0 -16
  1095. PaddleClas/paddleclas.py +0 -375
  1096. PaddleClas/ppcls/__init__.py +0 -20
  1097. PaddleClas/ppcls/data/__init__.py +0 -15
  1098. PaddleClas/ppcls/data/imaug/__init__.py +0 -94
  1099. PaddleClas/ppcls/data/imaug/autoaugment.py +0 -264
  1100. PaddleClas/ppcls/data/imaug/batch_operators.py +0 -117
  1101. PaddleClas/ppcls/data/imaug/cutout.py +0 -41
  1102. PaddleClas/ppcls/data/imaug/fmix.py +0 -217
  1103. PaddleClas/ppcls/data/imaug/grid.py +0 -89
  1104. PaddleClas/ppcls/data/imaug/hide_and_seek.py +0 -44
  1105. PaddleClas/ppcls/data/imaug/operators.py +0 -244
  1106. PaddleClas/ppcls/data/imaug/randaugment.py +0 -106
  1107. PaddleClas/ppcls/data/imaug/random_erasing.py +0 -55
  1108. PaddleClas/ppcls/data/reader.py +0 -318
  1109. PaddleClas/ppcls/modeling/__init__.py +0 -20
  1110. PaddleClas/ppcls/modeling/architectures/__init__.py +0 -51
  1111. PaddleClas/ppcls/modeling/architectures/alexnet.py +0 -132
  1112. PaddleClas/ppcls/modeling/architectures/darknet.py +0 -161
  1113. PaddleClas/ppcls/modeling/architectures/densenet.py +0 -308
  1114. PaddleClas/ppcls/modeling/architectures/distillation_models.py +0 -65
  1115. PaddleClas/ppcls/modeling/architectures/distilled_vision_transformer.py +0 -196
  1116. PaddleClas/ppcls/modeling/architectures/dpn.py +0 -425
  1117. PaddleClas/ppcls/modeling/architectures/efficientnet.py +0 -901
  1118. PaddleClas/ppcls/modeling/architectures/ghostnet.py +0 -331
  1119. PaddleClas/ppcls/modeling/architectures/googlenet.py +0 -207
  1120. PaddleClas/ppcls/modeling/architectures/hrnet.py +0 -742
  1121. PaddleClas/ppcls/modeling/architectures/inception_v3.py +0 -481
  1122. PaddleClas/ppcls/modeling/architectures/inception_v4.py +0 -455
  1123. PaddleClas/ppcls/modeling/architectures/mixnet.py +0 -782
  1124. PaddleClas/ppcls/modeling/architectures/mobilenet_v1.py +0 -266
  1125. PaddleClas/ppcls/modeling/architectures/mobilenet_v2.py +0 -248
  1126. PaddleClas/ppcls/modeling/architectures/mobilenet_v3.py +0 -359
  1127. PaddleClas/ppcls/modeling/architectures/regnet.py +0 -383
  1128. PaddleClas/ppcls/modeling/architectures/repvgg.py +0 -339
  1129. PaddleClas/ppcls/modeling/architectures/res2net.py +0 -272
  1130. PaddleClas/ppcls/modeling/architectures/res2net_vd.py +0 -295
  1131. PaddleClas/ppcls/modeling/architectures/resnest.py +0 -705
  1132. PaddleClas/ppcls/modeling/architectures/resnet.py +0 -316
  1133. PaddleClas/ppcls/modeling/architectures/resnet_vc.py +0 -309
  1134. PaddleClas/ppcls/modeling/architectures/resnet_vd.py +0 -354
  1135. PaddleClas/ppcls/modeling/architectures/resnext.py +0 -253
  1136. PaddleClas/ppcls/modeling/architectures/resnext101_wsl.py +0 -447
  1137. PaddleClas/ppcls/modeling/architectures/resnext_vd.py +0 -266
  1138. PaddleClas/ppcls/modeling/architectures/rexnet.py +0 -240
  1139. PaddleClas/ppcls/modeling/architectures/se_resnet_vd.py +0 -378
  1140. PaddleClas/ppcls/modeling/architectures/se_resnext.py +0 -290
  1141. PaddleClas/ppcls/modeling/architectures/se_resnext_vd.py +0 -285
  1142. PaddleClas/ppcls/modeling/architectures/shufflenet_v2.py +0 -320
  1143. PaddleClas/ppcls/modeling/architectures/squeezenet.py +0 -154
  1144. PaddleClas/ppcls/modeling/architectures/vgg.py +0 -152
  1145. PaddleClas/ppcls/modeling/architectures/vision_transformer.py +0 -402
  1146. PaddleClas/ppcls/modeling/architectures/xception.py +0 -345
  1147. PaddleClas/ppcls/modeling/architectures/xception_deeplab.py +0 -386
  1148. PaddleClas/ppcls/modeling/loss.py +0 -154
  1149. PaddleClas/ppcls/modeling/utils.py +0 -53
  1150. PaddleClas/ppcls/optimizer/__init__.py +0 -19
  1151. PaddleClas/ppcls/optimizer/learning_rate.py +0 -159
  1152. PaddleClas/ppcls/optimizer/optimizer.py +0 -165
  1153. PaddleClas/ppcls/utils/__init__.py +0 -27
  1154. PaddleClas/ppcls/utils/check.py +0 -151
  1155. PaddleClas/ppcls/utils/config.py +0 -201
  1156. PaddleClas/ppcls/utils/logger.py +0 -120
  1157. PaddleClas/ppcls/utils/metrics.py +0 -107
  1158. PaddleClas/ppcls/utils/misc.py +0 -62
  1159. PaddleClas/ppcls/utils/model_zoo.py +0 -213
  1160. PaddleClas/ppcls/utils/save_load.py +0 -163
  1161. PaddleClas/setup.py +0 -55
  1162. PaddleClas/tools/__init__.py +0 -15
  1163. PaddleClas/tools/download.py +0 -50
  1164. PaddleClas/tools/ema.py +0 -58
  1165. PaddleClas/tools/eval.py +0 -112
  1166. PaddleClas/tools/export_model.py +0 -85
  1167. PaddleClas/tools/export_serving_model.py +0 -76
  1168. PaddleClas/tools/infer/__init__.py +0 -16
  1169. PaddleClas/tools/infer/infer.py +0 -94
  1170. PaddleClas/tools/infer/predict.py +0 -117
  1171. PaddleClas/tools/infer/utils.py +0 -233
  1172. PaddleClas/tools/program.py +0 -444
  1173. PaddleClas/tools/test_hubserving.py +0 -113
  1174. PaddleClas/tools/train.py +0 -141
  1175. paddlex/cls.py +0 -76
  1176. paddlex/command.py +0 -215
  1177. paddlex/cv/__init__.py +0 -17
  1178. paddlex/cv/datasets/__init__.py +0 -18
  1179. paddlex/cv/datasets/coco.py +0 -169
  1180. paddlex/cv/datasets/imagenet.py +0 -88
  1181. paddlex/cv/datasets/seg_dataset.py +0 -91
  1182. paddlex/cv/datasets/voc.py +0 -301
  1183. paddlex/cv/models/__init__.py +0 -18
  1184. paddlex/cv/models/base.py +0 -623
  1185. paddlex/cv/models/classifier.py +0 -814
  1186. paddlex/cv/models/detector.py +0 -1747
  1187. paddlex/cv/models/load_model.py +0 -126
  1188. paddlex/cv/models/segmenter.py +0 -673
  1189. paddlex/cv/models/slim/__init__.py +0 -13
  1190. paddlex/cv/models/slim/prune.py +0 -55
  1191. paddlex/cv/models/utils/__init__.py +0 -13
  1192. paddlex/cv/models/utils/det_metrics/__init__.py +0 -15
  1193. paddlex/cv/models/utils/det_metrics/coco_utils.py +0 -217
  1194. paddlex/cv/models/utils/det_metrics/metrics.py +0 -220
  1195. paddlex/cv/models/utils/ema.py +0 -48
  1196. paddlex/cv/models/utils/seg_metrics.py +0 -62
  1197. paddlex/cv/models/utils/visualize.py +0 -394
  1198. paddlex/cv/transforms/__init__.py +0 -46
  1199. paddlex/cv/transforms/batch_operators.py +0 -286
  1200. paddlex/cv/transforms/box_utils.py +0 -41
  1201. paddlex/cv/transforms/functions.py +0 -193
  1202. paddlex/cv/transforms/operators.py +0 -1402
  1203. paddlex/det.py +0 -43
  1204. paddlex/paddleseg/__init__.py +0 -17
  1205. paddlex/paddleseg/core/__init__.py +0 -20
  1206. paddlex/paddleseg/core/infer.py +0 -289
  1207. paddlex/paddleseg/core/predict.py +0 -145
  1208. paddlex/paddleseg/core/train.py +0 -258
  1209. paddlex/paddleseg/core/val.py +0 -172
  1210. paddlex/paddleseg/cvlibs/__init__.py +0 -17
  1211. paddlex/paddleseg/cvlibs/callbacks.py +0 -279
  1212. paddlex/paddleseg/cvlibs/config.py +0 -359
  1213. paddlex/paddleseg/cvlibs/manager.py +0 -142
  1214. paddlex/paddleseg/cvlibs/param_init.py +0 -91
  1215. paddlex/paddleseg/datasets/__init__.py +0 -21
  1216. paddlex/paddleseg/datasets/ade.py +0 -112
  1217. paddlex/paddleseg/datasets/cityscapes.py +0 -86
  1218. paddlex/paddleseg/datasets/cocostuff.py +0 -79
  1219. paddlex/paddleseg/datasets/dataset.py +0 -164
  1220. paddlex/paddleseg/datasets/mini_deep_globe_road_extraction.py +0 -95
  1221. paddlex/paddleseg/datasets/optic_disc_seg.py +0 -97
  1222. paddlex/paddleseg/datasets/pascal_context.py +0 -80
  1223. paddlex/paddleseg/datasets/voc.py +0 -113
  1224. paddlex/paddleseg/models/__init__.py +0 -39
  1225. paddlex/paddleseg/models/ann.py +0 -436
  1226. paddlex/paddleseg/models/attention_unet.py +0 -189
  1227. paddlex/paddleseg/models/backbones/__init__.py +0 -18
  1228. paddlex/paddleseg/models/backbones/hrnet.py +0 -815
  1229. paddlex/paddleseg/models/backbones/mobilenetv3.py +0 -365
  1230. paddlex/paddleseg/models/backbones/resnet_vd.py +0 -364
  1231. paddlex/paddleseg/models/backbones/xception_deeplab.py +0 -415
  1232. paddlex/paddleseg/models/bisenet.py +0 -311
  1233. paddlex/paddleseg/models/danet.py +0 -220
  1234. paddlex/paddleseg/models/decoupled_segnet.py +0 -233
  1235. paddlex/paddleseg/models/deeplab.py +0 -258
  1236. paddlex/paddleseg/models/dnlnet.py +0 -231
  1237. paddlex/paddleseg/models/emanet.py +0 -219
  1238. paddlex/paddleseg/models/fast_scnn.py +0 -318
  1239. paddlex/paddleseg/models/fcn.py +0 -135
  1240. paddlex/paddleseg/models/gcnet.py +0 -223
  1241. paddlex/paddleseg/models/gscnn.py +0 -357
  1242. paddlex/paddleseg/models/hardnet.py +0 -309
  1243. paddlex/paddleseg/models/isanet.py +0 -202
  1244. paddlex/paddleseg/models/layers/__init__.py +0 -19
  1245. paddlex/paddleseg/models/layers/activation.py +0 -73
  1246. paddlex/paddleseg/models/layers/attention.py +0 -146
  1247. paddlex/paddleseg/models/layers/layer_libs.py +0 -168
  1248. paddlex/paddleseg/models/layers/nonlocal2d.py +0 -155
  1249. paddlex/paddleseg/models/layers/pyramid_pool.py +0 -182
  1250. paddlex/paddleseg/models/losses/__init__.py +0 -27
  1251. paddlex/paddleseg/models/losses/binary_cross_entropy_loss.py +0 -174
  1252. paddlex/paddleseg/models/losses/bootstrapped_cross_entropy.py +0 -73
  1253. paddlex/paddleseg/models/losses/cross_entropy_loss.py +0 -94
  1254. paddlex/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py +0 -129
  1255. paddlex/paddleseg/models/losses/dice_loss.py +0 -61
  1256. paddlex/paddleseg/models/losses/edge_attention_loss.py +0 -78
  1257. paddlex/paddleseg/models/losses/gscnn_dual_task_loss.py +0 -141
  1258. paddlex/paddleseg/models/losses/l1_loss.py +0 -76
  1259. paddlex/paddleseg/models/losses/lovasz_loss.py +0 -222
  1260. paddlex/paddleseg/models/losses/mean_square_error_loss.py +0 -65
  1261. paddlex/paddleseg/models/losses/mixed_loss.py +0 -58
  1262. paddlex/paddleseg/models/losses/ohem_cross_entropy_loss.py +0 -99
  1263. paddlex/paddleseg/models/losses/ohem_edge_attention_loss.py +0 -114
  1264. paddlex/paddleseg/models/ocrnet.py +0 -248
  1265. paddlex/paddleseg/models/pspnet.py +0 -147
  1266. paddlex/paddleseg/models/sfnet.py +0 -236
  1267. paddlex/paddleseg/models/shufflenet_slim.py +0 -268
  1268. paddlex/paddleseg/models/u2net.py +0 -574
  1269. paddlex/paddleseg/models/unet.py +0 -155
  1270. paddlex/paddleseg/models/unet_3plus.py +0 -316
  1271. paddlex/paddleseg/models/unet_plusplus.py +0 -237
  1272. paddlex/paddleseg/transforms/__init__.py +0 -16
  1273. paddlex/paddleseg/transforms/functional.py +0 -161
  1274. paddlex/paddleseg/transforms/transforms.py +0 -937
  1275. paddlex/paddleseg/utils/__init__.py +0 -22
  1276. paddlex/paddleseg/utils/config_check.py +0 -60
  1277. paddlex/paddleseg/utils/download.py +0 -163
  1278. paddlex/paddleseg/utils/env/__init__.py +0 -16
  1279. paddlex/paddleseg/utils/env/seg_env.py +0 -56
  1280. paddlex/paddleseg/utils/env/sys_env.py +0 -122
  1281. paddlex/paddleseg/utils/logger.py +0 -48
  1282. paddlex/paddleseg/utils/metrics.py +0 -146
  1283. paddlex/paddleseg/utils/progbar.py +0 -212
  1284. paddlex/paddleseg/utils/timer.py +0 -53
  1285. paddlex/paddleseg/utils/utils.py +0 -120
  1286. paddlex/paddleseg/utils/visualize.py +0 -90
  1287. paddlex/ppcls/__init__.py +0 -20
  1288. paddlex/ppcls/data/__init__.py +0 -15
  1289. paddlex/ppcls/data/imaug/__init__.py +0 -94
  1290. paddlex/ppcls/data/imaug/autoaugment.py +0 -264
  1291. paddlex/ppcls/data/imaug/batch_operators.py +0 -117
  1292. paddlex/ppcls/data/imaug/cutout.py +0 -41
  1293. paddlex/ppcls/data/imaug/fmix.py +0 -217
  1294. paddlex/ppcls/data/imaug/grid.py +0 -89
  1295. paddlex/ppcls/data/imaug/hide_and_seek.py +0 -44
  1296. paddlex/ppcls/data/imaug/operators.py +0 -256
  1297. paddlex/ppcls/data/imaug/randaugment.py +0 -106
  1298. paddlex/ppcls/data/imaug/random_erasing.py +0 -55
  1299. paddlex/ppcls/data/reader.py +0 -318
  1300. paddlex/ppcls/modeling/__init__.py +0 -20
  1301. paddlex/ppcls/modeling/architectures/__init__.py +0 -51
  1302. paddlex/ppcls/modeling/architectures/alexnet.py +0 -132
  1303. paddlex/ppcls/modeling/architectures/darknet.py +0 -161
  1304. paddlex/ppcls/modeling/architectures/densenet.py +0 -308
  1305. paddlex/ppcls/modeling/architectures/distillation_models.py +0 -65
  1306. paddlex/ppcls/modeling/architectures/distilled_vision_transformer.py +0 -196
  1307. paddlex/ppcls/modeling/architectures/dpn.py +0 -425
  1308. paddlex/ppcls/modeling/architectures/efficientnet.py +0 -901
  1309. paddlex/ppcls/modeling/architectures/ghostnet.py +0 -331
  1310. paddlex/ppcls/modeling/architectures/googlenet.py +0 -207
  1311. paddlex/ppcls/modeling/architectures/hrnet.py +0 -742
  1312. paddlex/ppcls/modeling/architectures/inception_v3.py +0 -541
  1313. paddlex/ppcls/modeling/architectures/inception_v4.py +0 -455
  1314. paddlex/ppcls/modeling/architectures/mixnet.py +0 -782
  1315. paddlex/ppcls/modeling/architectures/mobilenet_v1.py +0 -266
  1316. paddlex/ppcls/modeling/architectures/mobilenet_v2.py +0 -248
  1317. paddlex/ppcls/modeling/architectures/mobilenet_v3.py +0 -359
  1318. paddlex/ppcls/modeling/architectures/regnet.py +0 -383
  1319. paddlex/ppcls/modeling/architectures/repvgg.py +0 -339
  1320. paddlex/ppcls/modeling/architectures/res2net.py +0 -272
  1321. paddlex/ppcls/modeling/architectures/res2net_vd.py +0 -295
  1322. paddlex/ppcls/modeling/architectures/resnest.py +0 -705
  1323. paddlex/ppcls/modeling/architectures/resnet.py +0 -317
  1324. paddlex/ppcls/modeling/architectures/resnet_vc.py +0 -309
  1325. paddlex/ppcls/modeling/architectures/resnet_vd.py +0 -354
  1326. paddlex/ppcls/modeling/architectures/resnext.py +0 -259
  1327. paddlex/ppcls/modeling/architectures/resnext101_wsl.py +0 -447
  1328. paddlex/ppcls/modeling/architectures/resnext_vd.py +0 -266
  1329. paddlex/ppcls/modeling/architectures/rexnet.py +0 -240
  1330. paddlex/ppcls/modeling/architectures/se_resnet_vd.py +0 -378
  1331. paddlex/ppcls/modeling/architectures/se_resnext.py +0 -290
  1332. paddlex/ppcls/modeling/architectures/se_resnext_vd.py +0 -285
  1333. paddlex/ppcls/modeling/architectures/shufflenet_v2.py +0 -320
  1334. paddlex/ppcls/modeling/architectures/squeezenet.py +0 -154
  1335. paddlex/ppcls/modeling/architectures/vgg.py +0 -152
  1336. paddlex/ppcls/modeling/architectures/vision_transformer.py +0 -402
  1337. paddlex/ppcls/modeling/architectures/xception.py +0 -345
  1338. paddlex/ppcls/modeling/architectures/xception_deeplab.py +0 -386
  1339. paddlex/ppcls/modeling/loss.py +0 -158
  1340. paddlex/ppcls/modeling/utils.py +0 -53
  1341. paddlex/ppcls/optimizer/__init__.py +0 -19
  1342. paddlex/ppcls/optimizer/learning_rate.py +0 -159
  1343. paddlex/ppcls/optimizer/optimizer.py +0 -165
  1344. paddlex/ppcls/utils/__init__.py +0 -27
  1345. paddlex/ppcls/utils/check.py +0 -151
  1346. paddlex/ppcls/utils/config.py +0 -201
  1347. paddlex/ppcls/utils/logger.py +0 -120
  1348. paddlex/ppcls/utils/metrics.py +0 -112
  1349. paddlex/ppcls/utils/misc.py +0 -62
  1350. paddlex/ppcls/utils/model_zoo.py +0 -213
  1351. paddlex/ppcls/utils/save_load.py +0 -163
  1352. paddlex/ppdet/__init__.py +0 -16
  1353. paddlex/ppdet/core/__init__.py +0 -15
  1354. paddlex/ppdet/core/config/__init__.py +0 -13
  1355. paddlex/ppdet/core/config/schema.py +0 -248
  1356. paddlex/ppdet/core/config/yaml_helpers.py +0 -118
  1357. paddlex/ppdet/core/workspace.py +0 -279
  1358. paddlex/ppdet/data/__init__.py +0 -21
  1359. paddlex/ppdet/data/reader.py +0 -304
  1360. paddlex/ppdet/data/shm_utils.py +0 -67
  1361. paddlex/ppdet/data/source/__init__.py +0 -27
  1362. paddlex/ppdet/data/source/category.py +0 -823
  1363. paddlex/ppdet/data/source/coco.py +0 -243
  1364. paddlex/ppdet/data/source/dataset.py +0 -192
  1365. paddlex/ppdet/data/source/keypoint_coco.py +0 -656
  1366. paddlex/ppdet/data/source/mot.py +0 -360
  1367. paddlex/ppdet/data/source/voc.py +0 -204
  1368. paddlex/ppdet/data/source/widerface.py +0 -180
  1369. paddlex/ppdet/data/transform/__init__.py +0 -28
  1370. paddlex/ppdet/data/transform/autoaugment_utils.py +0 -1593
  1371. paddlex/ppdet/data/transform/batch_operators.py +0 -758
  1372. paddlex/ppdet/data/transform/gridmask_utils.py +0 -83
  1373. paddlex/ppdet/data/transform/keypoint_operators.py +0 -665
  1374. paddlex/ppdet/data/transform/mot_operators.py +0 -636
  1375. paddlex/ppdet/data/transform/op_helper.py +0 -468
  1376. paddlex/ppdet/data/transform/operators.py +0 -2103
  1377. paddlex/ppdet/engine/__init__.py +0 -29
  1378. paddlex/ppdet/engine/callbacks.py +0 -262
  1379. paddlex/ppdet/engine/env.py +0 -47
  1380. paddlex/ppdet/engine/export_utils.py +0 -118
  1381. paddlex/ppdet/engine/tracker.py +0 -425
  1382. paddlex/ppdet/engine/trainer.py +0 -535
  1383. paddlex/ppdet/metrics/__init__.py +0 -23
  1384. paddlex/ppdet/metrics/coco_utils.py +0 -184
  1385. paddlex/ppdet/metrics/json_results.py +0 -151
  1386. paddlex/ppdet/metrics/keypoint_metrics.py +0 -202
  1387. paddlex/ppdet/metrics/map_utils.py +0 -396
  1388. paddlex/ppdet/metrics/metrics.py +0 -300
  1389. paddlex/ppdet/metrics/mot_eval_utils.py +0 -192
  1390. paddlex/ppdet/metrics/mot_metrics.py +0 -184
  1391. paddlex/ppdet/metrics/widerface_utils.py +0 -393
  1392. paddlex/ppdet/model_zoo/__init__.py +0 -18
  1393. paddlex/ppdet/model_zoo/model_zoo.py +0 -86
  1394. paddlex/ppdet/model_zoo/tests/__init__.py +0 -13
  1395. paddlex/ppdet/model_zoo/tests/test_get_model.py +0 -48
  1396. paddlex/ppdet/model_zoo/tests/test_list_model.py +0 -68
  1397. paddlex/ppdet/modeling/__init__.py +0 -41
  1398. paddlex/ppdet/modeling/architectures/__init__.py +0 -40
  1399. paddlex/ppdet/modeling/architectures/cascade_rcnn.py +0 -144
  1400. paddlex/ppdet/modeling/architectures/centernet.py +0 -103
  1401. paddlex/ppdet/modeling/architectures/deepsort.py +0 -111
  1402. paddlex/ppdet/modeling/architectures/fairmot.py +0 -107
  1403. paddlex/ppdet/modeling/architectures/faster_rcnn.py +0 -106
  1404. paddlex/ppdet/modeling/architectures/fcos.py +0 -105
  1405. paddlex/ppdet/modeling/architectures/jde.py +0 -125
  1406. paddlex/ppdet/modeling/architectures/keypoint_hrhrnet.py +0 -286
  1407. paddlex/ppdet/modeling/architectures/keypoint_hrnet.py +0 -203
  1408. paddlex/ppdet/modeling/architectures/mask_rcnn.py +0 -135
  1409. paddlex/ppdet/modeling/architectures/meta_arch.py +0 -45
  1410. paddlex/ppdet/modeling/architectures/s2anet.py +0 -103
  1411. paddlex/ppdet/modeling/architectures/solov2.py +0 -110
  1412. paddlex/ppdet/modeling/architectures/ssd.py +0 -84
  1413. paddlex/ppdet/modeling/architectures/ttfnet.py +0 -98
  1414. paddlex/ppdet/modeling/architectures/yolo.py +0 -104
  1415. paddlex/ppdet/modeling/backbones/__init__.py +0 -37
  1416. paddlex/ppdet/modeling/backbones/blazenet.py +0 -322
  1417. paddlex/ppdet/modeling/backbones/darknet.py +0 -341
  1418. paddlex/ppdet/modeling/backbones/dla.py +0 -244
  1419. paddlex/ppdet/modeling/backbones/ghostnet.py +0 -476
  1420. paddlex/ppdet/modeling/backbones/hrnet.py +0 -724
  1421. paddlex/ppdet/modeling/backbones/mobilenet_v1.py +0 -410
  1422. paddlex/ppdet/modeling/backbones/mobilenet_v3.py +0 -497
  1423. paddlex/ppdet/modeling/backbones/name_adapter.py +0 -69
  1424. paddlex/ppdet/modeling/backbones/res2net.py +0 -358
  1425. paddlex/ppdet/modeling/backbones/resnet.py +0 -606
  1426. paddlex/ppdet/modeling/backbones/senet.py +0 -140
  1427. paddlex/ppdet/modeling/backbones/vgg.py +0 -216
  1428. paddlex/ppdet/modeling/bbox_utils.py +0 -464
  1429. paddlex/ppdet/modeling/heads/__init__.py +0 -41
  1430. paddlex/ppdet/modeling/heads/bbox_head.py +0 -379
  1431. paddlex/ppdet/modeling/heads/cascade_head.py +0 -285
  1432. paddlex/ppdet/modeling/heads/centernet_head.py +0 -194
  1433. paddlex/ppdet/modeling/heads/face_head.py +0 -113
  1434. paddlex/ppdet/modeling/heads/fcos_head.py +0 -270
  1435. paddlex/ppdet/modeling/heads/keypoint_hrhrnet_head.py +0 -108
  1436. paddlex/ppdet/modeling/heads/mask_head.py +0 -253
  1437. paddlex/ppdet/modeling/heads/roi_extractor.py +0 -111
  1438. paddlex/ppdet/modeling/heads/s2anet_head.py +0 -845
  1439. paddlex/ppdet/modeling/heads/solov2_head.py +0 -537
  1440. paddlex/ppdet/modeling/heads/ssd_head.py +0 -175
  1441. paddlex/ppdet/modeling/heads/ttf_head.py +0 -314
  1442. paddlex/ppdet/modeling/heads/yolo_head.py +0 -124
  1443. paddlex/ppdet/modeling/keypoint_utils.py +0 -302
  1444. paddlex/ppdet/modeling/layers.py +0 -1142
  1445. paddlex/ppdet/modeling/losses/__init__.py +0 -35
  1446. paddlex/ppdet/modeling/losses/ctfocal_loss.py +0 -67
  1447. paddlex/ppdet/modeling/losses/fairmot_loss.py +0 -41
  1448. paddlex/ppdet/modeling/losses/fcos_loss.py +0 -225
  1449. paddlex/ppdet/modeling/losses/iou_aware_loss.py +0 -48
  1450. paddlex/ppdet/modeling/losses/iou_loss.py +0 -210
  1451. paddlex/ppdet/modeling/losses/jde_loss.py +0 -182
  1452. paddlex/ppdet/modeling/losses/keypoint_loss.py +0 -228
  1453. paddlex/ppdet/modeling/losses/solov2_loss.py +0 -101
  1454. paddlex/ppdet/modeling/losses/ssd_loss.py +0 -163
  1455. paddlex/ppdet/modeling/losses/yolo_loss.py +0 -212
  1456. paddlex/ppdet/modeling/mot/__init__.py +0 -25
  1457. paddlex/ppdet/modeling/mot/matching/__init__.py +0 -19
  1458. paddlex/ppdet/modeling/mot/matching/deepsort_matching.py +0 -382
  1459. paddlex/ppdet/modeling/mot/matching/jde_matching.py +0 -145
  1460. paddlex/ppdet/modeling/mot/motion/__init__.py +0 -17
  1461. paddlex/ppdet/modeling/mot/motion/kalman_filter.py +0 -270
  1462. paddlex/ppdet/modeling/mot/tracker/__init__.py +0 -23
  1463. paddlex/ppdet/modeling/mot/tracker/base_jde_tracker.py +0 -267
  1464. paddlex/ppdet/modeling/mot/tracker/base_sde_tracker.py +0 -145
  1465. paddlex/ppdet/modeling/mot/tracker/deepsort_tracker.py +0 -165
  1466. paddlex/ppdet/modeling/mot/tracker/jde_tracker.py +0 -262
  1467. paddlex/ppdet/modeling/mot/utils.py +0 -181
  1468. paddlex/ppdet/modeling/mot/visualization.py +0 -130
  1469. paddlex/ppdet/modeling/necks/__init__.py +0 -25
  1470. paddlex/ppdet/modeling/necks/centernet_fpn.py +0 -185
  1471. paddlex/ppdet/modeling/necks/fpn.py +0 -233
  1472. paddlex/ppdet/modeling/necks/hrfpn.py +0 -131
  1473. paddlex/ppdet/modeling/necks/ttf_fpn.py +0 -243
  1474. paddlex/ppdet/modeling/necks/yolo_fpn.py +0 -1034
  1475. paddlex/ppdet/modeling/ops.py +0 -1599
  1476. paddlex/ppdet/modeling/post_process.py +0 -449
  1477. paddlex/ppdet/modeling/proposal_generator/__init__.py +0 -2
  1478. paddlex/ppdet/modeling/proposal_generator/anchor_generator.py +0 -135
  1479. paddlex/ppdet/modeling/proposal_generator/proposal_generator.py +0 -81
  1480. paddlex/ppdet/modeling/proposal_generator/rpn_head.py +0 -269
  1481. paddlex/ppdet/modeling/proposal_generator/target.py +0 -671
  1482. paddlex/ppdet/modeling/proposal_generator/target_layer.py +0 -476
  1483. paddlex/ppdet/modeling/reid/__init__.py +0 -23
  1484. paddlex/ppdet/modeling/reid/fairmot_embedding_head.py +0 -117
  1485. paddlex/ppdet/modeling/reid/jde_embedding_head.py +0 -189
  1486. paddlex/ppdet/modeling/reid/pyramidal_embedding.py +0 -151
  1487. paddlex/ppdet/modeling/reid/resnet.py +0 -320
  1488. paddlex/ppdet/modeling/shape_spec.py +0 -33
  1489. paddlex/ppdet/modeling/tests/__init__.py +0 -13
  1490. paddlex/ppdet/modeling/tests/test_architectures.py +0 -59
  1491. paddlex/ppdet/modeling/tests/test_base.py +0 -75
  1492. paddlex/ppdet/modeling/tests/test_ops.py +0 -839
  1493. paddlex/ppdet/modeling/tests/test_yolov3_loss.py +0 -420
  1494. paddlex/ppdet/optimizer.py +0 -285
  1495. paddlex/ppdet/slim/__init__.py +0 -62
  1496. paddlex/ppdet/slim/distill.py +0 -111
  1497. paddlex/ppdet/slim/prune.py +0 -85
  1498. paddlex/ppdet/slim/quant.py +0 -52
  1499. paddlex/ppdet/utils/__init__.py +0 -13
  1500. paddlex/ppdet/utils/check.py +0 -93
  1501. paddlex/ppdet/utils/checkpoint.py +0 -216
  1502. paddlex/ppdet/utils/cli.py +0 -151
  1503. paddlex/ppdet/utils/colormap.py +0 -56
  1504. paddlex/ppdet/utils/download.py +0 -477
  1505. paddlex/ppdet/utils/logger.py +0 -71
  1506. paddlex/ppdet/utils/stats.py +0 -95
  1507. paddlex/ppdet/utils/visualizer.py +0 -292
  1508. paddlex/ppdet/utils/voc_utils.py +0 -87
  1509. paddlex/seg.py +0 -38
  1510. paddlex/tools/__init__.py +0 -16
  1511. paddlex/tools/convert.py +0 -52
  1512. paddlex/tools/dataset_conversion/__init__.py +0 -24
  1513. paddlex/tools/dataset_conversion/x2coco.py +0 -379
  1514. paddlex/tools/dataset_conversion/x2imagenet.py +0 -82
  1515. paddlex/tools/dataset_conversion/x2seg.py +0 -343
  1516. paddlex/tools/dataset_conversion/x2voc.py +0 -230
  1517. paddlex/tools/dataset_split/__init__.py +0 -23
  1518. paddlex/tools/dataset_split/coco_split.py +0 -69
  1519. paddlex/tools/dataset_split/imagenet_split.py +0 -75
  1520. paddlex/tools/dataset_split/seg_split.py +0 -96
  1521. paddlex/tools/dataset_split/utils.py +0 -75
  1522. paddlex/tools/dataset_split/voc_split.py +0 -91
  1523. paddlex/tools/split.py +0 -41
  1524. paddlex/utils/checkpoint.py +0 -439
  1525. paddlex/utils/shm.py +0 -67
  1526. paddlex/utils/stats.py +0 -68
  1527. paddlex/utils/utils.py +0 -140
  1528. paddlex-2.0.0rc4.dist-info/LICENSE +0 -201
  1529. paddlex-2.0.0rc4.dist-info/METADATA +0 -29
  1530. paddlex-2.0.0rc4.dist-info/RECORD +0 -445
  1531. paddlex-2.0.0rc4.dist-info/WHEEL +0 -5
  1532. paddlex-2.0.0rc4.dist-info/entry_points.txt +0 -3
  1533. paddlex-2.0.0rc4.dist-info/top_level.txt +0 -2
@@ -0,0 +1,2495 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import os
17
+ from dataclasses import dataclass
18
+ from typing import Any, Dict, List, Optional, Tuple, Union
19
+
20
+ import paddle
21
+ import paddle.distributed.fleet.meta_parallel as mpu
22
+ import paddle.nn as nn
23
+ import paddle.nn.functional as F
24
+ from paddle import Tensor
25
+ from paddle.distributed import fleet
26
+ from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
27
+ from paddle.distributed.fleet.utils import recompute
28
+
29
+ from .....utils import logging
30
+ from ....utils.benchmark import (
31
+ benchmark,
32
+ get_inference_operations,
33
+ set_inference_operations,
34
+ )
35
+ from ...common.vlm.activations import ACT2FN
36
+ from ...common.vlm.bert_padding import index_first_axis, pad_input, unpad_input
37
+ from ...common.vlm.flash_attn_utils import has_flash_attn_func
38
+ from ...common.vlm.transformers import PretrainedConfig, PretrainedModel
39
+ from ...common.vlm.transformers.model_outputs import (
40
+ BaseModelOutputWithPast,
41
+ ModelOutput,
42
+ )
43
+
44
# Resolve optional flash-attention kernels once at import time; what
# ``has_flash_attn_func`` returns when they are unavailable is defined in
# ...common.vlm.flash_attn_utils — presumably ``None`` placeholders (confirm).
flash_attn_func, flash_attn_varlen_func = has_flash_attn_func()
# True when the current Paddle device string contains "npu" (used to pick
# NPU-specific code paths elsewhere in this module).
_IS_NPU = "npu" in paddle.get_device()

# Linear-layer aliases; the column/row-parallel variants come from Paddle's
# tensor-parallel (fleet meta_parallel) utilities.
Linear = nn.Linear
ColumnParallelLinear = mpu.ColumnParallelLinear
RowParallelLinear = mpu.RowParallelLinear
50
+
51
+
52
class Qwen2VLVisionConfig(PretrainedConfig):
    """Configuration for the Qwen2-VL vision encoder.

    Stores the vision-transformer topology (depth, widths, heads) and the
    patchification settings used to turn image/video frames into tokens.
    """

    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        attn_implementation="eager",  # new added
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer topology.
        self.depth = depth
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        # Patchification of the input frames.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.spatial_merge_size = spatial_merge_size
        # Attention backend selector (e.g. "eager").
        self.attn_implementation = attn_implementation

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        """Load the vision config, unwrapping it from a composite checkpoint.

        A full qwen2_vl checkpoint nests the vision settings under the
        "vision_config" key; in that case only the sub-dict is used.
        """
        raw_config, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if raw_config.get("model_type") == "qwen2_vl":
            raw_config = raw_config["vision_config"]

        # Warn when instantiating from a config saved for a different model
        # type; this mirrors the upstream transformers behavior.
        if (
            "model_type" in raw_config
            and hasattr(cls, "model_type")
            and raw_config["model_type"] != cls.model_type
        ):
            logging.warning(
                f"You are using a model of type {raw_config['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(raw_config, **kwargs)
107
+
108
+
109
class Qwen2VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2VLModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict`, *optional*):
            The config for the visual encoder initialization.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
    """

    model_type = "qwen2_vl"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        # Normalize the nested vision configuration.
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()
        else:
            # Bug fix: a ready-made Qwen2VLVisionConfig (or compatible) object
            # used to be silently dropped here, leaving ``self.vision_config``
            # unset and causing an AttributeError on later access. Accept it
            # directly instead.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # GQA: fall back to MHA when the key/value head count is unspecified.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
227
+
228
+
229
def get_triangle_upper_mask(x, mask=None):
    """Return ``mask`` if provided, otherwise build a causal additive mask.

    The generated mask has ``finfo(x.dtype).min`` strictly above the diagonal
    and zeros elsewhere, shaped like ``x`` with dim 1 collapsed to 1 so it
    broadcasts (presumably over the head axis — confirm against callers).
    """
    if mask is not None:
        return mask

    mask_shape = x.shape
    mask_shape[1] = 1
    neg_inf = paddle.finfo(x.dtype).min
    upper = paddle.triu(
        paddle.full(mask_shape, neg_inf, dtype=x.dtype), diagonal=1
    )
    upper.stop_gradient = True
    return upper
238
+
239
+
240
def parallel_matmul(
    x: Tensor, y: Tensor, transpose_y=True, tensor_parallel_output=True
):
    """Compute ``x @ y`` (``y`` optionally transposed), tensor-parallel aware.

    When fleet's hybrid communicate group is initialized, the model-parallel
    degree is > 1 and ``y`` is a distributed weight, the matmul is performed
    on the identity-broadcast input; the sharded logits are either returned
    as-is (``tensor_parallel_output=True``) or concatenated across the model
    parallel group. Otherwise a plain matmul is used.

    Args:
        x: Input activations.
        y: Weight tensor (typically the embedding/lm-head weight).
        transpose_y: Whether to transpose ``y`` in the matmul.
        tensor_parallel_output: If True, keep logits sharded per rank.

    Returns:
        The (possibly sharded) logits tensor.
    """
    is_fleet_init = True
    tensor_parallel_degree = 1
    try:
        hcg = fleet.get_hybrid_communicate_group()
        model_parallel_group = hcg.get_model_parallel_group()
        tensor_parallel_degree = hcg.get_model_parallel_world_size()
    except Exception:
        # Bug fix: this was a bare ``except:``, which would also swallow
        # KeyboardInterrupt/SystemExit. fleet raises here when no hybrid
        # communicate group has been initialized (e.g. single-card run).
        is_fleet_init = False

    if paddle.in_dynamic_mode():
        y_is_distributed = y.is_distributed
    else:
        # Static graph: infer distribution from the parallel degree.
        y_is_distributed = tensor_parallel_degree > 1

    if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed:
        # Identity op in forward; all-reduces gradients in backward.
        input_parallel = paddle.distributed.collective._c_identity(
            x, group=model_parallel_group
        )
        logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y)

        if tensor_parallel_output:
            return logits
        # Gather the vocab-sharded logits across the model parallel group.
        return paddle.distributed.collective._c_concat(
            logits, group=model_parallel_group
        )

    else:
        logits = paddle.matmul(x, y, transpose_y=transpose_y)
        return logits
273
+
274
+
275
def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["paddle.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["paddle.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`paddle.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        # Some configs rotate only a fraction of the head dimension.
        partial_rotary_factor = (
            config.partial_rotary_factor
            if hasattr(config, "partial_rotary_factor")
            else 1.0
        )
        head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        dim = int(head_dim * partial_rotary_factor)
    else:
        # Robustness fix: previously this fell through and raised a confusing
        # NameError on `base`/`dim`; fail with an explicit message instead.
        raise ValueError(
            "Either `config` or `rope_kwargs` (with `base` and `dim`) must be "
            "provided to `_compute_default_rope_parameters`."
        )

    # Default RoPE applies no post-scaling to the cos/sin tables.
    attention_factor = 1.0

    # inv_freq[i] = 1 / base^(2i/dim): one frequency per channel pair.
    inv_freq = 1.0 / (
        base ** (paddle.arange(0, dim, 2, dtype="int64").astype("float32") / dim)
    )
    return inv_freq, attention_factor
322
+
323
+
324
# Registry mapping a rope_scaling "rope_type" name to the function that
# computes its (inv_freq, attention_scaling) parameters. Only the default
# (unscaled) variant is implemented in this module.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
}
327
+
328
+
329
def _get_unpad_data(attention_mask):
    """Gather indexing metadata used to strip padding from a batch.

    Returns a tuple of:
      - flat indices of the non-pad token positions,
      - cumulative sequence lengths (zero-prepended via ``F.pad``), as
        consumed by varlen flash-attention,
      - the longest sequence length in the batch.
    """
    per_sample_lengths = attention_mask.sum(axis=-1, dtype="int32")
    longest_seq = per_sample_lengths.max().item()
    token_indices = paddle.nonzero(
        attention_mask.flatten(), as_tuple=False
    ).flatten()
    cu_seqlens = F.pad(
        paddle.cumsum(per_sample_lengths, axis=0), (1, 0), data_format="NCL"
    )
    return (
        token_indices,
        cu_seqlens,
        longest_seq,
    )
341
+
342
+
343
def is_casual_mask(attention_mask):
    """
    Upper triangular of attention_mask equals to attention_mask is casual
    """
    # A mask is causal exactly when taking its upper triangle changes nothing.
    upper_triangle = paddle.triu(attention_mask)
    return (upper_triangle == attention_mask).all().item()
348
+
349
+
350
def _make_causal_mask(input_ids_shape, past_key_values_length):
    """
    Make causal mask used for self-attention

    Produces a boolean mask of shape
    ``[batch, 1, tgt_len, tgt_len + past_len]`` where True marks attendable
    positions.
    """
    batch_size, target_length = input_ids_shape

    causal = paddle.tril(paddle.ones((target_length, target_length), dtype="bool"))

    # All cached (past) positions are visible to every query token.
    if past_key_values_length > 0:
        visible_past = paddle.ones(
            [target_length, past_key_values_length], dtype="bool"
        )
        causal = paddle.concat([visible_past, causal], axis=-1)

    return causal[None, None, :, :].expand(
        [batch_size, 1, target_length, target_length + past_key_values_length]
    )
367
+
368
+
369
def _expand_2d_mask(mask, dtype, tgt_length):
    """
    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.

    Note: ``dtype`` is accepted for interface compatibility but the result is
    always boolean.
    """
    batch_size = mask.shape[0]
    src_length = mask.shape[-1]
    if tgt_length is None:
        tgt_length = src_length

    bool_mask = mask[:, None, None, :].astype("bool")
    bool_mask.stop_gradient = True
    return bool_mask.expand([batch_size, 1, tgt_length, src_length])
381
+
382
+
383
@dataclass
class Qwen2VLCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Qwen2VL causal language model (or autoregressive) outputs.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        rope_deltas (`paddle.Tensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
    """

    loss: Optional[paddle.Tensor] = None  # only set when labels are provided
    logits: paddle.Tensor = None
    past_key_values: Optional[List[paddle.Tensor]] = None  # KV cache
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
    rope_deltas: Optional[paddle.Tensor] = None  # see docstring above
420
+
421
+
422
class Qwen2VLRotaryEmbedding(nn.Layer):
    """Rotary position embedding (RoPE) for Qwen2-VL.

    Computes the per-position cos/sin tables that
    ``apply_multimodal_rotary_pos_emb`` consumes. Can be constructed either
    from explicit keyword arguments (legacy path, ``config=None``) or from a
    ``Qwen2VLConfig`` whose ``rope_scaling`` dict selects the rope variant.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen2VLConfig] = None,
    ):
        super().__init__()
        # Legacy path: forward the explicit kwargs to the rope-init function.
        self.rope_kwargs = {}
        if config is None:
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get(
                    "rope_type", config.rope_scaling.get("type")
                )
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        # KeyError here means the requested rope variant is unsupported
        # (only "default" is registered in this module).
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        self.inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device, **self.rope_kwargs
        )
        # Kept so dynamic variants can restore the unscaled frequencies.
        self.original_inv_freq = self.inv_freq

        self._set_cos_sin_cache(seq_len=max_position_embeddings)

    def _set_cos_sin_cache(self, seq_len: int):
        # Precompute cos/sin tables for positions [0, seq_len).
        # NOTE(review): cos_cached/sin_cached are not read by ``forward``
        # below — presumably kept for callers that access them directly;
        # confirm before removing.
        self.max_seq_len_cached = seq_len
        t = paddle.arange(seq_len, dtype="float32")
        freqs = paddle.einsum("i,j->ij", t, self.inv_freq)
        emb = paddle.concat([freqs, freqs], axis=-1)
        self.cos_cached = emb.cos()
        self.sin_cached = emb.sin()

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = paddle.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.inv_freq = inv_freq
            self.max_seq_len_cached = seq_len

        if (
            seq_len < self.original_max_seq_len
            and self.max_seq_len_cached > self.original_max_seq_len
        ):  # reset
            self.inv_freq = self.original_inv_freq
            self.max_seq_len_cached = self.original_max_seq_len

    @paddle.no_grad()
    def forward(self, x, position_ids):
        """Return (cos, sin) tables for ``position_ids``, cast to ``x.dtype``.

        The expand to a leading size-3 axis suggests ``position_ids`` carries
        the multimodal (temporal/height/width) rope axes — TODO confirm
        against the caller.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        inv_freq_expanded = (
            self.inv_freq[None, None, :, None]
            .astype("float32")
            .expand([3, position_ids.shape[1], -1, 1])
        )
        position_ids_expanded = position_ids[:, :, None, :].astype("float32")
        # NOTE(review): device_type is computed but never used below.
        device_type = paddle.get_device()
        device_type = (
            device_type
            if isinstance(device_type, str) and device_type != "mps"
            else "cpu"
        )
        with paddle.amp.auto_cast():
            # Outer product of positions and inverse frequencies, duplicated
            # so cos/sin cover both rotated halves of the head dim.
            freqs = paddle.matmul(inv_freq_expanded, position_ids_expanded)
            freqs = freqs.transpose([0, 1, 3, 2])
            emb = paddle.concat((freqs, freqs), axis=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Post-scaling from the rope-init function (1.0 for default RoPE).
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.astype(x.dtype), sin.astype(x.dtype)
524
+
525
+
526
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
527
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    half = x.shape[-1] // 2
    front = x[..., :half]
    back = x[..., half:]
    # (a, b) -> (-b, a): the 2-D rotation applied pairwise across channels.
    return paddle.concat([-back, front], axis=-1)
532
+
533
+
534
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).

    The channel dimension of ``cos``/``sin`` is split into chunks owned by the
    temporal, height and width rope axes (cycling through axis 0, 1, 2); text
    tokens carry identical indices on all three axes, reducing to ordinary
    1-D RoPE.

    Args:
        q (`paddle.Tensor`): The query tensor.
        k (`paddle.Tensor`): The key tensor.
        cos (`paddle.Tensor`): The cosine part of the rotary embedding.
        sin (`paddle.Tensor`): The sine part of the rotary embedding.
        mrope_section (`List(int)`): Channel-chunk sizes for the
            temporal/height/width sections.
        unsqueeze_dim (`int`, *optional*, defaults to 1): Dimension along
            which cos/sin are unsqueezed so they broadcast against q and k
            (1 for `[batch, heads, seq, dim]`, 2 for `[batch, seq, heads, dim]`).

    Returns:
        `tuple(paddle.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """

    # Double the sections: the table covers both rotated halves of the dim.
    sections = mrope_section * 2

    def _select_sections(table):
        # For chunk i, pick the rope axis i % 3 (temporal/height/width).
        chunks = table.split(sections, axis=-1)
        merged = paddle.concat(
            x=[chunk[idx % 3] for idx, chunk in enumerate(chunks)], axis=-1
        )
        return merged.unsqueeze(axis=unsqueeze_dim)

    cos = _select_sections(cos)
    sin = _select_sections(sin)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
578
+
579
+
580
def apply_rotary_pos_emb_vision(
    tensor: paddle.Tensor, freqs: paddle.Tensor
) -> paddle.Tensor:
    """Rotate vision-tower activations by the rotary angle table *freqs*.

    The computation runs in float32 with autocast disabled for numerical
    stability; the result is cast back to the input dtype.
    """
    input_dtype = tensor.dtype

    with paddle.amp.auto_cast(False):
        tensor = tensor.astype(dtype="float32")

        def _broadcastable(angle):
            # Duplicate each frequency for its paired channel and add
            # batch/head broadcast axes (assumes freqs is [seq, dim/2],
            # as produced by VisionRotaryEmbedding.forward).
            return (
                angle.unsqueeze(1)
                .tile(repeat_times=[1, 1, 2])
                .unsqueeze(0)
                .astype(dtype="float32")
            )

        cos = _broadcastable(freqs.cos())
        sin = _broadcastable(freqs.sin())
        rotated = tensor * cos + rotate_half(tensor) * sin

    return paddle.cast(rotated, input_dtype)
604
+
605
+
606
class VisionRotaryEmbedding(nn.Layer):
    """Rotary embedding for the vision tower.

    Maps a sequence length to a [seqlen, dim/2] table of rotary angles using
    the standard RoPE inverse-frequency schedule.
    """

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        # inv_freq[j] = theta ** (-2j / dim) over the even channel indices.
        exponents = paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim
        self.inv_freq = 1.0 / theta**exponents

    def forward(self, seqlen: int) -> paddle.Tensor:
        positions = paddle.arange(seqlen).cast(self.inv_freq.dtype)
        return paddle.outer(x=positions, y=self.inv_freq)
617
+
618
+
619
class PatchEmbed(nn.Layer):
    """Project raw pixel patches to embedding vectors with a 3D convolution.

    Kernel and stride are both (temporal_patch_size, patch_size, patch_size),
    so each spatio-temporal patch is embedded independently.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        patch_shape = [temporal_patch_size, patch_size, patch_size]
        self.proj = nn.Conv3D(
            in_channels,
            embed_dim,
            kernel_size=patch_shape,
            stride=patch_shape,
            bias_attr=False,
        )

    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        # paddle.cast is used instead of `.to(...)` because static-graph
        # variables do not expose a `.to` method.
        proj_dtype = self.proj.weight.dtype
        patches = hidden_states.reshape(
            [
                -1,
                self.in_channels,
                self.temporal_patch_size,
                self.patch_size,
                self.patch_size,
            ]
        )
        embedded = self.proj(paddle.cast(patches, dtype=proj_dtype))
        return embedded.reshape([-1, self.embed_dim])
661
+
662
+
663
class PatchMerger(nn.Layer):
    """Merge spatially adjacent patch features and project them to `dim`.

    Each output token aggregates spatial_merge_size**2 neighbouring patches:
    the inputs are layer-normalized, regrouped so merged patches share one
    row, and pushed through a two-layer GELU MLP.
    """

    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        self.hidden_size = context_dim * (spatial_merge_size**2)
        self.ln_q = nn.LayerNorm(context_dim, epsilon=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Linear(self.hidden_size, dim),
        )

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        normalized = self.ln_q(x)
        grouped = normalized.reshape([-1, self.hidden_size])
        return self.mlp(grouped)
677
+
678
+
679
class VisionMlp(nn.Layer):
    """Feed-forward block of a vision transformer layer: fc1 -> act -> fc2."""

    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden_dim)
        # Activation is looked up by name from the shared ACT2FN registry.
        self.act = ACT2FN[hidden_act]
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, x) -> paddle.Tensor:
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        return self.fc2(hidden)
688
+
689
+
690
class VisionAttention(nn.Layer):
    """Eager (non-flash) self-attention for the vision tower.

    Operates on an unbatched token stream of shape [seq, dim]; `cu_seqlens`
    (cumulative sequence lengths) delimits the images/videos packed into the
    stream, and attention is restricted to tokens of the same segment via a
    block-diagonal mask.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        # Fused projection producing q, k and v in one matmul.
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
        self.proj = nn.Linear(dim, dim)
        self.head_dim = dim // num_heads  # needed for the softmax scale below

    def forward(
        self,
        hidden_states: paddle.Tensor,
        cu_seqlens: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor = None,
    ) -> paddle.Tensor:
        seq_length = hidden_states.shape[0]
        # [seq, 3*dim] -> 3 x [seq, num_heads, head_dim]
        q, k, v = (
            self.qkv(hidden_states)
            .reshape([seq_length, 3, self.num_heads, -1])
            .transpose([1, 0, 2, 3])
            .unbind(0)
        )
        # Rotary embedding expects a leading batch axis; add and drop it.
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Block-diagonal mask: True inside each [cu_seqlens[i-1], cu_seqlens[i])
        # segment, so packed images cannot attend to each other.
        attention_mask = paddle.zeros([1, seq_length, seq_length], dtype="bool")
        for i in range(1, len(cu_seqlens)):
            attention_mask[
                ...,
                cu_seqlens[i - 1] : cu_seqlens[i],
                cu_seqlens[i - 1] : cu_seqlens[i],
            ] = True

        # Convert the boolean mask to additive form: 0 where allowed,
        # dtype-min where disallowed.
        zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
        neg_inf = paddle.full_like(
            attention_mask,
            paddle.finfo(hidden_states.dtype).min,
            dtype=hidden_states.dtype,
        )
        attention_mask = paddle.where(attention_mask, zero, neg_inf)

        # [seq, heads, head_dim] -> [heads, seq, head_dim] for batched matmul.
        q = q.transpose([1, 0, 2])
        k = k.transpose([1, 0, 2])
        v = v.transpose([1, 0, 2])
        attn_weights = paddle.matmul(q, k.transpose([0, 2, 1])) / math.sqrt(
            self.head_dim
        )
        attn_weights = attn_weights + attention_mask
        # Softmax in float32 for numerical stability.
        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype="float32")
        attn_output = paddle.matmul(attn_weights, v)
        attn_output = attn_output.transpose([1, 0, 2])
        attn_output = attn_output.reshape([seq_length, -1])
        attn_output = self.proj(attn_output)
        return attn_output
743
+
744
+
745
class VisionFlashAttention2(nn.Layer):
    """Flash-attention variant of the vision tower self-attention.

    Same interface and semantics as `VisionAttention`, but the attention
    kernel is a varlen flash-attention call (or the NPU equivalent), using
    `cu_seqlens` directly instead of materializing a block-diagonal mask.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        # Fused projection producing q, k and v in one matmul.
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
        self.proj = nn.Linear(dim, dim)
        self.head_dim = dim // num_heads  # needed for the softmax scale below

    def forward(
        self,
        hidden_states: paddle.Tensor,
        cu_seqlens: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor = None,
    ) -> paddle.Tensor:
        seq_length = tuple(hidden_states.shape)[0]
        # [seq, 3*dim] -> [3, seq, num_heads, head_dim]
        qkv = (
            self.qkv(hidden_states)
            .reshape([seq_length, 3, self.num_heads, -1])
            .transpose(perm=[1, 0, 2, 3])
        )
        q, k, v = qkv.unbind(axis=0)
        # Rotary embedding expects a leading batch axis; add and drop it.
        q = apply_rotary_pos_emb_vision(q.unsqueeze(axis=0), rotary_pos_emb).squeeze(
            axis=0
        )
        k = apply_rotary_pos_emb_vision(k.unsqueeze(axis=0), rotary_pos_emb).squeeze(
            axis=0
        )

        if _IS_NPU:
            # NPU kernel handles the varlen layout itself; it runs in bf16.
            attn_output = paddle.nn.functional.flash_attention_npu(
                q.astype("bfloat16"),
                k.astype("bfloat16"),
                v.astype("bfloat16"),
                is_varlen=True,
                batch_size=1,
                seq_length=seq_length,
            ).reshape([seq_length, -1])
        else:
            # Longest packed segment, needed by the varlen kernel.
            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()

            softmax_scale = self.head_dim**-0.5
            attn_output = (
                flash_attn_varlen_func(
                    q.astype("bfloat16"),
                    k.astype("bfloat16"),
                    v.astype("bfloat16"),
                    cu_seqlens,
                    cu_seqlens,
                    max_seqlen,
                    max_seqlen,
                    scale=softmax_scale,
                )[0]
                .squeeze(0)
                .reshape([seq_length, -1])
            )
        # The kernels run in bf16; cast back to the projection's weight dtype
        # before the output matmul.
        if self.proj.weight.dtype == paddle.bfloat16:
            attn_output = attn_output.astype(paddle.bfloat16)
        elif self.proj.weight.dtype == paddle.float16:
            attn_output = attn_output.astype(paddle.float16)
        elif self.proj.weight.dtype == paddle.float32:
            attn_output = attn_output.astype(paddle.float32)
        attn_output = self.proj(attn_output)
        return attn_output
808
+
809
+
810
def create_attention_module(config, module_type, layer_idx=None):
    """Build the attention layer for one sub-model.

    Args:
        config: model (for "qwen2vl") or vision-tower (for "vision") config
            carrying the layer hyperparameters.
        module_type (str): "qwen2vl" for the text decoder, "vision" for the
            vision transformer.
        layer_idx (int, optional): decoder layer index, forwarded to the
            qwen2vl attention for KV-cache bookkeeping.

    Returns:
        The flash-attention implementation when flash attention is available,
        otherwise the eager fallback.

    Raises:
        ValueError: if `module_type` is not a supported value.
    """
    if flash_attn_func is not None:
        if module_type == "qwen2vl":
            return Qwen2VLFlashAttention2(config, layer_idx)
        elif module_type == "vision":
            return VisionFlashAttention2(config.embed_dim, num_heads=config.num_heads)
    else:
        logging.warning_once(
            f"Warning: Flash Attention2 is not available for {module_type}, fallback to normal attention."
        )

    if module_type == "qwen2vl":
        return Qwen2VLAttention(config, layer_idx)
    elif module_type == "vision":
        return VisionAttention(config.embed_dim, num_heads=config.num_heads)
    # Previously an unsupported module_type fell through and returned None,
    # surfacing later as a confusing AttributeError at call time; fail fast.
    raise ValueError(f"Unknown attention module type: {module_type!r}")
825
+
826
+
827
class Qwen2VLVisionBlock(nn.Layer):
    """Pre-norm vision transformer block: attention and MLP, each residual."""

    def __init__(self, config, attn_implementation: str = "flash_attention_2") -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)
        self.norm2 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)

        # Flash or eager attention, chosen by availability at import time.
        self.attn = create_attention_module(config, "vision")
        self.mlp = VisionMlp(
            dim=config.embed_dim,
            hidden_dim=int(config.embed_dim * config.mlp_ratio),
            hidden_act=config.hidden_act,
        )

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor:
        attn_out = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
        )
        hidden_states = hidden_states + attn_out
        return hidden_states + self.mlp(self.norm2(hidden_states))
849
+
850
+
851
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: paddle.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: paddle.dtype,
    min_dtype: float,
    cache_position: paddle.Tensor,
    batch_size: int,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

    Args:
        attention_mask (`paddle.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`paddle.dtype`):
            The dtype to use for the 4D attention mask.
        min_dtype (`float`):
            The minimum value representable with the dtype `dtype`.
        cache_position (`paddle.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`paddle.Tensor`):
            Batch size.
    """
    if attention_mask is not None and attention_mask.dim() == 4:
        # Caller already supplied a full 4D additive mask; trust it as-is.
        causal_mask = attention_mask
    else:
        # Start from an all-masked matrix, then carve out the allowed region.
        causal_mask = paddle.full(
            [sequence_length, target_length], fill_value=min_dtype, dtype=dtype
        )
        if sequence_length != 1:
            # Keep strict upper triangle masked (standard causal masking);
            # a single-token query can attend to the whole cache.
            causal_mask = paddle.triu(x=causal_mask, diagonal=1)
        # Zero out (unmask) cache slots at or before each query's position.
        causal_mask *= paddle.arange(target_length) > cache_position.reshape([-1, 1])
        causal_mask = causal_mask[None, None, :, :].expand(
            shape=[batch_size, 1, -1, -1]
        )
        if attention_mask is not None:
            causal_mask = causal_mask.clone()
            mask_length = tuple(attention_mask.shape)[-1]
            # A position is padding iff both the causal term (0 = allowed) and
            # the 2D padding mask entry sum to 0; re-mask those positions.
            padding_mask = (
                causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
            )
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[
                :, :, :, :mask_length
            ].masked_fill(mask=padding_mask, value=min_dtype)

    return causal_mask
904
+
905
+
906
class Qwen2RMSNorm(nn.Layer):
    """Root-mean-square layer norm (no mean subtraction, no bias)."""

    def __init__(self, config: Qwen2VLConfig, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm.

        Note: `config` is accepted for signature parity with sibling layers
        but is not used by this module.
        """
        super().__init__()
        # Learnable per-channel scale, initialized to ones.
        self.weight = paddle.create_parameter(
            shape=[hidden_size],
            dtype=paddle.get_default_dtype(),
            default_initializer=nn.initializer.Constant(1.0),
        )
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Variance is accumulated in float32 for numerical stability. In
        # dynamic mode autocast is explicitly disabled so AMP cannot re-cast
        # the reduction; the static-graph branch runs the same math without
        # the context manager (auto_cast is a dygraph-only construct here).
        if paddle.in_dynamic_mode():
            with paddle.amp.auto_cast(False):
                variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
                hidden_states = (
                    paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
                )
        else:
            variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
            hidden_states = (
                paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
            )

        # Bring the normalized activations back to the parameter dtype before
        # applying the learned scale (rsqrt result is float32).
        if self.weight.dtype in [paddle.float16, paddle.bfloat16]:
            hidden_states = paddle.cast(hidden_states, self.weight.dtype)
        return hidden_states * self.weight
935
+
936
+
937
class Qwen2MLP(nn.Layer):
    """SwiGLU-style feed-forward network of the Qwen2 decoder.

    Computes down_proj(act(gate_proj(x)) * up_proj(x)), with the projections
    sharded across the tensor-parallel group when one is configured.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fuse_attention_ffn = config.fuse_attention_ffn
        self.tensor_parallel_degree = config.tensor_parallel_degree

        if config.tensor_parallel_degree > 1:
            # Column-parallel up/gate projections, row-parallel down projection.
            self.gate_proj = ColumnParallelLinear(
                self.hidden_size,
                self.intermediate_size,
                gather_output=False,
                has_bias=False,
            )
            self.up_proj = ColumnParallelLinear(
                self.hidden_size,
                self.intermediate_size,
                gather_output=False,
                has_bias=False,
            )
            self.down_proj = RowParallelLinear(
                self.intermediate_size,
                self.hidden_size,
                input_is_parallel=True,
                has_bias=False,
            )
        else:
            # w1 / w3 / w2 in the Llama naming convention.
            self.gate_proj = Linear(
                self.hidden_size, self.intermediate_size, bias_attr=False
            )
            self.up_proj = Linear(
                self.hidden_size, self.intermediate_size, bias_attr=False
            )
            self.down_proj = Linear(
                self.intermediate_size, self.hidden_size, bias_attr=False
            )

        self.act_fn = ACT2FN[config.hidden_act]
        self.fuse_swiglu = False

    def forward(self, x):
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        if self.fuse_swiglu:
            # Fused kernels take both operands in a single call.
            activated = self.act_fn(gate, up)
        else:
            activated = self.act_fn(gate) * up
        return self.down_proj(activated)
987
+
988
+
989
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
    """Repeat each key/value head `n_rep` times along the head axis.

    Equivalent to paddle.repeat_interleave(hidden_states, n_rep, axis=1):
    (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep,
    seq, head_dim). Used for grouped-query attention.
    """
    if n_rep == 1:
        return hidden_states
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    expanded = hidden_states.unsqueeze(2).expand(
        [batch, num_kv_heads, n_rep, seq_len, head_dim]
    )
    return expanded.reshape([batch, num_kv_heads * n_rep, seq_len, head_dim])
1002
+
1003
+
1004
class Qwen2VLAttention(nn.Layer):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Eager implementation for the Qwen2-VL text decoder: grouped-query
    attention with multimodal (3D) rotary embeddings and a tuple-based
    KV cache.
    """

    def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logging.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        # Grouped-query attention: each KV head serves this many query heads.
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.rope_scaling = config.rope_scaling
        # self.sequence_parallel = config.sequence_parallel

        if config.tensor_parallel_degree > 1:
            # Heads are sharded across the tensor-parallel group; both counts
            # must divide evenly.
            assert (
                self.num_heads % config.tensor_parallel_degree == 0
            ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
            self.num_heads = self.num_heads // config.tensor_parallel_degree

            assert (
                self.num_key_value_heads % config.tensor_parallel_degree == 0
            ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
            self.num_key_value_heads = (
                self.num_key_value_heads // config.tensor_parallel_degree
            )

        if config.tensor_parallel_degree > 1:
            self.q_proj = ColumnParallelLinear(
                self.hidden_size, self.hidden_size, has_bias=True, gather_output=False
            )
            self.k_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False)  # fmt:skip
            self.v_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False)  # fmt:skip
            self.o_proj = RowParallelLinear(
                self.hidden_size,
                self.hidden_size,
                has_bias=False,
                input_is_parallel=True,
            )
        else:
            # k/v projections use the *unsharded* KV head count from config.
            self.q_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=True)
            self.k_proj = Linear(
                self.hidden_size,
                self.config.num_key_value_heads * self.head_dim,
                bias_attr=True,
            )
            self.v_proj = Linear(
                self.hidden_size,
                self.config.num_key_value_heads * self.head_dim,
                bias_attr=True,
            )
            self.o_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=False)

        self.rotary_emb = Qwen2VLRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,  # default true
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
        """Run attention for one decoder layer.

        Returns (attn_output, attn_weights or None, updated KV cache or None).
        """
        bsz, q_len, _ = hidden_states.shape

        # If the projection fails (dtype mismatch between the input and the
        # layer weights), retry after casting to the configured model dtype.
        try:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)
        except:
            hidden_states = hidden_states.astype(self.config.dtype)
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # 0 in a reshape target keeps the corresponding input dim (batch, seq).
        target_query_shape = [0, 0, self.num_heads, self.head_dim]
        target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
        query_states = query_states.reshape(shape=target_query_shape)
        key_states = key_states.reshape(shape=target_key_value_shape)
        value_states = value_states.reshape(shape=target_key_value_shape)

        # [b, s, h, d] -> [b, h, s, d]
        new_perm = [0, 2, 1, 3]
        query_states = query_states.transpose(new_perm)
        key_states = key_states.transpose(new_perm)
        value_states = value_states.transpose(new_perm)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += cache_position[0] + 1

        # Multimodal RoPE: rotate q/k per temporal/height/width sections.
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        # Append to the tuple-style KV cache along the sequence axis.
        if past_key_value is not None:
            key_states = paddle.concat([past_key_value[0], key_states], axis=2)
            value_states = paddle.concat([past_key_value[1], value_states], axis=2)
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Score computation runs in float32 for numerical stability.
        query_states = query_states.astype("float32")
        key_states = key_states.astype("float32")
        value_states = value_states.astype("float32")

        attn_weights = paddle.matmul(
            query_states, key_states.transpose([0, 1, 3, 2])
        ) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask
        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype="float32")
        attn_weights = nn.functional.dropout(
            x=attn_weights, p=self.attention_dropout, training=self.training
        )
        # Cast both operands back to the model dtype for the value matmul.
        attn_output = paddle.matmul(
            attn_weights.cast(self.config.dtype), value_states.cast(self.config.dtype)
        )

        if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]:
            raise ValueError(
                f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        # [b, h, s, d] -> [b, s, h*d]
        attn_output = attn_output.transpose([0, 2, 1, 3])
        attn_output = attn_output.reshape([bsz, q_len, -1])

        # Match the output projection's weight dtype before the final matmul.
        if self.o_proj.weight.dtype == paddle.bfloat16:
            attn_output = attn_output.astype(paddle.bfloat16)
        elif self.o_proj.weight.dtype == paddle.float16:
            attn_output = attn_output.astype(paddle.float16)
        elif self.o_proj.weight.dtype == paddle.float32:
            attn_output = attn_output.astype(paddle.float32)

        attn_output = self.o_proj(attn_output)
        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights, past_key_value
1167
+
1168
+
1169
class Qwen2VLFlashAttention2(Qwen2VLAttention):
    """
    Qwen2VL flash attention module, following Qwen2VL attention module. This module inherits from `Qwen2VLAttention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
    config.max_window_layers layers.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,  # default true
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
        """Same contract as `Qwen2VLAttention.forward`, but the attention
        kernel is flash attention (attn_weights is always None)."""
        bsz, q_len, _ = tuple(hidden_states.shape)

        # If the projection fails (dtype mismatch between input and weights),
        # retry after casting to bfloat16.
        try:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)
        except:
            hidden_states = hidden_states.astype("bfloat16")
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # 0 in a reshape target keeps the corresponding input dim (batch, seq).
        target_query_shape = [0, 0, self.num_heads, self.head_dim]
        target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
        query_states = query_states.reshape(shape=target_query_shape)
        key_states = key_states.reshape(shape=target_key_value_shape)
        value_states = value_states.reshape(shape=target_key_value_shape)

        # [b, s, h, d] -> [b, h, s, d] for rope / cache handling.
        new_perm = [0, 2, 1, 3]
        query_states = query_states.transpose(new_perm)
        key_states = key_states.transpose(new_perm)
        value_states = value_states.transpose(new_perm)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += cache_position[0] + 1

        # Because the input can be padded, the absolute sequence length depends on the max position id.
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        # Append to the tuple-style KV cache along the sequence axis.
        if past_key_value is not None:
            key_states = paddle.concat([past_key_value[0], key_states], axis=2)
            value_states = paddle.concat([past_key_value[1], value_states], axis=2)
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Reashape to the expected shape for Flash Attention
        # [b, h, s, d] -> [b, s, h, d], e.g. [1, 3599, 12, 128]
        query_states = query_states.transpose(perm=[0, 2, 1, 3])
        key_states = key_states.transpose(perm=[0, 2, 1, 3])
        value_states = value_states.transpose(perm=[0, 2, 1, 3])

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len
        )

        attn_output = attn_output.reshape([bsz, q_len, -1])
        attn_output = self.o_proj(attn_output)
        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`paddle.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`paddle.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`paddle.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`paddle.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # Contains at least one padding token in the sequence
        # Single-token decode steps can attend to the whole cache (non-causal).
        causal = self.is_causal and query_length != 1

        if _IS_NPU:
            if attention_mask is not None:
                attn_output = paddle.nn.functional.flash_attention_npu(  # TODO: flash_attn_unpadded
                    query_states,
                    key_states,
                    value_states,
                    attn_mask=attention_mask,
                    dropout=dropout,
                    causal=causal,
                    is_varlen=True,
                )
            else:
                # NPU kernel runs in bf16; restore the caller's dtype after.
                dtype = query_states.dtype
                attn_output = paddle.nn.functional.flash_attention_npu(  # TODO: flash_attn_unpadded
                    query_states.astype("bfloat16"),
                    key_states.astype("bfloat16"),
                    value_states.astype("bfloat16"),
                    attn_mask=attention_mask,
                    dropout=dropout,
                    causal=causal,
                )
                attn_output = attn_output.astype(dtype)
        else:
            head_dim = query_states.shape[-1]
            softmax_scale = head_dim**-0.5  # TODO: must be set manually here

            if attention_mask is not None:
                # Padded batch: strip padding tokens, run the varlen kernel,
                # then scatter the outputs back to the padded layout.
                batch_size = query_states.shape[0]
                (
                    query_states,
                    key_states,
                    value_states,
                    indices_q,
                    cu_seq_lens,
                    max_seq_lens,
                ) = self._unpad_input(
                    query_states, key_states, value_states, attention_mask, query_length
                )
                cu_seqlens_q, cu_seqlens_k = cu_seq_lens
                max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    scale=softmax_scale,  # not softmax_scale=
                    dropout=dropout,
                    causal=causal,
                )[0]

                attn_output = pad_input(
                    attn_output_unpad, indices_q, batch_size, query_length
                )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    causal=causal,
                )[0]

        return attn_output

    def _unpad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        """Remove padding tokens ahead of a varlen flash-attention call.

        Returns the flattened (token-major) q/k/v, the query token indices for
        re-padding, plus (cu_seqlens_q, cu_seqlens_k) and the per-batch max
        sequence lengths.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # TODO: cuda error (gather via index_first_axis reported issues here)
        key_layer = index_first_axis(
            key_layer.reshape([batch_size * kv_seq_len, num_key_value_heads, head_dim]),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(
                [batch_size * kv_seq_len, num_key_value_heads, head_dim]
            ),
            indices_k,
        )

        if query_length == kv_seq_len:
            # Prefill: queries share the key layout, reuse the key indices.
            query_layer = index_first_axis(
                query_layer.reshape(
                    [batch_size * kv_seq_len, self.num_heads, head_dim]
                ),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Single-token decode: one query per batch element.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = paddle.arange(
                batch_size + 1, dtype=paddle.int32
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q.to(paddle.int64),
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
1399
+
1400
+
1401
class Qwen2VLDecoderLayer(nn.Layer):
    """One pre-norm transformer decoder block.

    Structure: RMSNorm -> self-attention -> residual add, then
    RMSNorm -> MLP -> residual add.
    """

    def __init__(self, config: Qwen2VLConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # Sliding-window attention is only wired up for the flash_attention_2
        # backend; warn once when it is requested with anything else.
        if (
            config.use_sliding_window
            and config.attn_implementation != "flash_attention_2"
        ):
            logging.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config.attn_implementation}`; "
                "unexpected results may be encountered."
            )

        self.self_attn = create_attention_module(config, "qwen2vl", layer_idx=layer_idx)
        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(
            config, config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Qwen2RMSNorm(
            config, config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[paddle.Tensor] = None,
        **kwargs,
    ):
        """Run one decoder block.

        Args:
            hidden_states (`paddle.Tensor`): input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`paddle.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            position_ids (`paddle.Tensor`, *optional*): position indices forwarded
                to the attention module.
            past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached key/value
                projection states from previous steps.
            output_attentions (`bool`, *optional*): whether to also return the
                attention weights.
            use_cache (`bool`, *optional*): whether to return updated key/value
                states for fast decoding.
            cache_position (`paddle.Tensor`, *optional*): indices of the input
                tokens within the full sequence.
            kwargs: ignored; kept for interface compatibility.

        Returns:
            Tuple of `(hidden_states[, attn_weights][, present_key_value])`.
        """
        skip = hidden_states
        normed = self.input_layernorm(hidden_states)

        # Self attention sub-layer.
        attn_out, attn_weights, cached_kv = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = skip + attn_out

        # Feed-forward sub-layer.
        skip = hidden_states
        hidden_states = skip + self.mlp(self.post_attention_layernorm(hidden_states))

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (attn_weights,)
        if use_cache:
            outputs = outputs + (cached_kv,)
        return outputs
1487
+
1488
+
1489
class Qwen2VLPreTrainedModel(PretrainedModel):
    """Common base for all Qwen2-VL checkpoints: binds the config class and
    declares split/placement metadata used by the loading machinery."""

    config_class = Qwen2VLConfig
    base_model_prefix = "model"
    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, layer):
        """Initialize weights of a freshly created sub-layer.

        Linear/Conv3D weights and Embedding weights are drawn from
        N(0, std); biases are zeroed, and the embedding row for the
        padding index (if any) is zeroed.
        """
        # Fix: this was hard-coded to 0.2 — ten times the conventional
        # transformer initialization. The reference Qwen2-VL implementation
        # uses `config.initializer_range` (0.02 by default); fall back to
        # 0.02 when the config does not define it.
        std = getattr(self.config, "initializer_range", 0.02)
        if isinstance(layer, (nn.Linear, nn.Conv3D)):
            nn.initializer.Normal(mean=0.0, std=std)(layer.weight)
            if layer.bias is not None:
                nn.initializer.Constant(0.0)(layer.bias)
        elif isinstance(layer, nn.Embedding):
            nn.initializer.Normal(mean=0.0, std=std)(layer.weight)
            if layer._padding_idx is not None:
                with paddle.no_grad():
                    # Keep the padding embedding exactly zero.
                    layer.weight[layer._padding_idx] = 0.0
1506
+
1507
+
1508
class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
    """Vision tower of Qwen2-VL: patch embedding, a stack of vision blocks with
    2D rotary position embeddings, and a patch merger producing LLM-sized features."""

    config_class = Qwen2VLVisionConfig
    _no_split_modules = ["Qwen2VLVisionBlock"]

    def __init__(self, config) -> None:
        super().__init__(config)
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        # Rotary embedding covers half the per-head dimension (pairs of channels).
        head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)

        self.blocks = nn.LayerList(
            [Qwen2VLVisionBlock(config) for _ in range(config.depth)]
        )
        self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)
        # Gradient checkpointing flag; off by default.
        self.enable_recompute = False

    def get_dtype(self) -> paddle.dtype:
        """Return the parameter dtype, probed from the first block's MLP weight."""
        return self.blocks[0].mlp.fc2.weight.dtype

    def rot_pos_emb(self, grid_thw):
        """Build per-patch rotary position embeddings for every (t, h, w) grid.

        For each image/video grid the (h, w) positions are laid out in
        spatial_merge_size x spatial_merge_size tiles (the reshape/transpose
        below), then repeated over the temporal dimension.
        """
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = paddle.arange(h).unsqueeze(1).expand([-1, w])
            hpos_ids = hpos_ids.reshape(
                [
                    h // self.spatial_merge_size,
                    self.spatial_merge_size,
                    w // self.spatial_merge_size,
                    self.spatial_merge_size,
                ]
            )
            hpos_ids = hpos_ids.transpose(perm=[0, 2, 1, 3])
            hpos_ids = hpos_ids.flatten()

            wpos_ids = paddle.arange(w).unsqueeze(0).expand([h, -1])
            wpos_ids = wpos_ids.reshape(
                [
                    h // self.spatial_merge_size,
                    self.spatial_merge_size,
                    w // self.spatial_merge_size,
                    self.spatial_merge_size,
                ]
            )
            wpos_ids = wpos_ids.transpose([0, 2, 1, 3])
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(
                paddle.stack(x=[hpos_ids, wpos_ids], axis=-1).tile(repeat_times=[t, 1])
            )
        pos_ids = paddle.concat(x=pos_ids, axis=0)
        # One table sized for the largest spatial extent, indexed per patch.
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(start_axis=1)
        return rotary_pos_emb

    @paddle.jit.not_to_static
    def recompute_training_full(
        self,
        layer_module: nn.Layer,
        hidden_states: paddle.Tensor,
        cu_seqlens_now: paddle.Tensor,
        rotary_pos_emb: paddle.Tensor,
    ):
        """Run one vision block under gradient checkpointing (recompute)."""

        def create_custom_forward(module):
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        hidden_states = recompute(
            create_custom_forward(layer_module),
            hidden_states,
            cu_seqlens_now,
            rotary_pos_emb,
            # use_reentrant=self.config.recompute_use_reentrant,
        )
        return hidden_states

    def forward(
        self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor
    ) -> paddle.Tensor:
        """Encode pixel patches into merged visual features.

        Args:
            hidden_states: flattened patch pixels accepted by `patch_embed`.
            grid_thw: per-image/video (t, h, w) patch-grid sizes, shape (n, 3).

        Returns:
            Merged patch features from `self.merger`.
        """
        hidden_states = self.patch_embed(hidden_states)
        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # Cumulative sequence lengths (one frame = h*w patches), with a leading 0,
        # in the format expected by varlen attention.
        cu_seqlens = paddle.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(axis=0, dtype="int32")
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for idx, blk in enumerate(self.blocks):
            if self.enable_recompute and self.training:
                hidden_states = self.recompute_training_full(
                    blk, hidden_states, cu_seqlens, rotary_pos_emb
                )
            else:
                hidden_states = blk(
                    hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
                )

        return self.merger(hidden_states)
1616
+
1617
+
1618
class Qwen2VLModel(Qwen2VLPreTrainedModel):
    """Text decoder of Qwen2-VL: token embedding, a stack of decoder layers,
    and a final RMSNorm. Supports tensor parallelism and an incremental KV cache."""

    def __init__(self, config: Qwen2VLConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        # Recompute defaults to False and is controlled by Trainer

        # Use a vocab-parallel embedding only when the vocab splits evenly
        # across the tensor-parallel group.
        if (
            config.tensor_parallel_degree > 1
            and config.vocab_size % config.tensor_parallel_degree == 0
        ):
            self.embed_tokens = mpu.VocabParallelEmbedding(
                self.vocab_size,
                self.hidden_size,
                weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()),
            )
        else:
            self.embed_tokens = nn.Embedding(
                self.vocab_size,
                self.hidden_size,
            )

        # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.LayerList(
            [
                Qwen2VLDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = Qwen2RMSNorm(config, config.hidden_size, eps=config.rms_norm_eps)

        # NOTE(review): "enamble" looks like a typo for "enable" (the vision
        # tower uses `enable_recompute`); left unchanged because external code
        # may set this attribute by its current name — confirm before renaming.
        self.enamble_recompute = False

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding layer."""
        self.embed_tokens = value

    @staticmethod
    def _prepare_decoder_attention_mask(
        attention_mask, input_shape, past_key_values_length, dtype
    ):
        """Expand a 2D/3D/4D mask to a 4D additive float mask.

        Boolean "keep" positions become 0.0 and masked positions become the
        dtype's most negative value, ready to be added to attention scores.
        """
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            if len(attention_mask.shape) == 2:
                expanded_attn_mask = _expand_2d_mask(
                    attention_mask, dtype, tgt_length=input_shape[-1]
                )
                # For decoding phase in generation, seq_length = 1, we don't need to add causal mask
                if input_shape[-1] > 1:
                    combined_attention_mask = _make_causal_mask(
                        input_shape,
                        past_key_values_length=past_key_values_length,
                    )
                    expanded_attn_mask = expanded_attn_mask & combined_attention_mask
            # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
            elif len(attention_mask.shape) == 3:
                expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
            # if attention_mask is already 4-D, do nothing
            else:
                expanded_attn_mask = attention_mask
        else:
            expanded_attn_mask = _make_causal_mask(
                input_shape,
                past_key_values_length=past_key_values_length,
            )
        # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
        expanded_attn_mask = paddle.where(
            expanded_attn_mask, 0.0, paddle.finfo(dtype).min
        ).astype(dtype)
        return expanded_attn_mask

    @paddle.jit.not_to_static
    def recompute_training_full(
        self,
        layer_module: nn.Layer,
        hidden_states: paddle.Tensor,
        attention_mask: paddle.Tensor,
        position_ids: Optional[paddle.Tensor],
        past_key_value: paddle.Tensor,
        output_attentions: bool,
        use_cache: bool,
        cache_position: Optional[paddle.Tensor] = None,
    ):
        """Run one decoder layer under gradient checkpointing (recompute)."""

        def create_custom_forward(module):
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        hidden_states = recompute(
            create_custom_forward(layer_module),
            hidden_states,
            attention_mask,
            position_ids,
            past_key_value,
            output_attentions,
            use_cache,
            cache_position,
            use_reentrant=self.config.recompute_use_reentrant,
        )
        return hidden_states

    def forward(
        self,
        input_ids: paddle.Tensor = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        past_key_values: Optional[List[paddle.Tensor]] = None,
        inputs_embeds: Optional[paddle.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[paddle.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Decode a (possibly cached) token sequence into hidden states.

        Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        Returns either a tuple or a `BaseModelOutputWithPast` depending on
        `return_dict`.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.layers))
        # NOTE: to make cache can be clear in-time
        past_key_values = list(past_key_values)

        seq_length_with_past = seq_length
        cache_length = 0
        if past_key_values[0] is not None:
            cache_length = past_key_values[0][0].shape[2]  # shape[1] in qwen2
            seq_length_with_past += cache_length

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # embed positions
        if attention_mask is None:
            # [bs, seq_len]
            attention_mask = paddle.ones(
                (batch_size, seq_length_with_past), dtype=paddle.bool
            )

        # With the varlen flash-attention path the raw 2D mask is consumed
        # directly; otherwise expand it into a 4D additive causal mask.
        if flash_attn_varlen_func:
            causal_mask = attention_mask
        else:
            causal_mask = self._prepare_decoder_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                cache_length,
                inputs_embeds.dtype,
            )  # [bs, 1, seq_len, seq_len]

        if cache_position is None:
            past_seen_tokens = (
                past_key_values[0][0].shape[2] if past_key_values[0] is not None else 0
            )
            cache_position = paddle.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]
            )

        if position_ids is None:
            # the hard coded `3` is for temporal, height and width.
            position_ids = cache_position.reshape([1, 1, -1]).expand(
                [3, inputs_embeds.shape[0], -1]
            )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = ()

        for idx, (decoder_layer) in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            if self.enamble_recompute and self.training:
                layer_outputs = self.recompute_training_full(
                    decoder_layer,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,  # False
                    use_cache=use_cache,  # True
                    cache_position=cache_position,
                )

            # NOTE: clear outdate cache after it has been used for memory saving
            past_key_value = past_key_values[idx] = None

            hidden_states = layer_outputs[0]

            next_decoder_cache = (
                next_decoder_cache + (layer_outputs[-1],) if use_cache else None
            )

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
1879
+
1880
+
1881
class Qwen2LMHead(nn.Layer):
    """Language-model head projecting hidden states to vocabulary logits.

    Supports tensor parallelism (each rank holds a vocab shard) and weight
    tying with the input embedding (`transpose_y=True` with shared weights).
    """

    def __init__(self, config, embedding_weights=None, transpose_y=False):
        super(Qwen2LMHead, self).__init__()
        self.config = config
        # Shard the vocabulary across ranks only when it divides evenly.
        if (
            config.tensor_parallel_degree > 1
            and config.vocab_size % config.tensor_parallel_degree == 0
        ):
            vocab_size = config.vocab_size // config.tensor_parallel_degree
        else:
            vocab_size = config.vocab_size

        self.transpose_y = transpose_y
        if transpose_y:
            # only for weight from embedding_weights
            # Weight stored as [vocab, hidden] (embedding layout); matmul
            # below transposes it.
            if embedding_weights is not None:
                self.weight = embedding_weights
            else:
                self.weight = self.create_parameter(
                    shape=[vocab_size, config.hidden_size],
                    dtype=paddle.get_default_dtype(),
                )
        else:

            # The two branches create the same-shaped parameter; the sharded
            # case additionally runs under the tensor-parallel RNG tracker so
            # each rank draws a distinct, reproducible initialization.
            if vocab_size != config.vocab_size:
                with get_rng_state_tracker().rng_state():
                    self.weight = self.create_parameter(
                        shape=[config.hidden_size, vocab_size],
                        dtype=paddle.get_default_dtype(),
                    )
            else:
                self.weight = self.create_parameter(
                    shape=[config.hidden_size, vocab_size],
                    dtype=paddle.get_default_dtype(),
                )

        # Must set distributed attr for Tensor Parallel !
        self.weight.is_distributed = (
            True if (vocab_size != config.vocab_size) else False
        )
        if self.weight.is_distributed:
            # for tie_word_embeddings
            self.weight.split_axis = 0 if self.transpose_y else 1

    def forward(self, hidden_states, tensor_parallel_output=None):
        """Project `hidden_states` to vocabulary logits via `parallel_matmul`."""
        if tensor_parallel_output is None:
            tensor_parallel_output = self.config.tensor_parallel_output

        # Ensure the activation dtype matches the weight before the matmul.
        if self.weight.dtype != hidden_states.dtype:
            hidden_states = paddle.cast(hidden_states, self.weight.dtype)

        logits = parallel_matmul(
            hidden_states,
            self.weight,
            transpose_y=self.transpose_y,
            tensor_parallel_output=tensor_parallel_output,
        )
        return logits
1940
+
1941
+
1942
+ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
1943
+ _tied_weights_keys = ["lm_head.weight"]
1944
+
1945
    def __init__(self, config, attn_implementation="flash_attention_2"):
        """Assemble the full multimodal model: vision tower, text decoder and LM head.

        Args:
            config: Qwen2-VL configuration (with a nested `vision_config`).
            attn_implementation: attention backend name, propagated to both
                the text and the vision config (default "flash_attention_2").
        """
        super().__init__(config)
        # Propagate the chosen attention backend to both sub-configs.
        config._attn_implementation = attn_implementation
        config.vision_config._attn_implementation = attn_implementation

        self.visual = Qwen2VisionTransformerPretrainedModel._from_config(
            config.vision_config
        )
        self.model = Qwen2VLModel(config)
        self.vocab_size = config.vocab_size

        if config.tie_word_embeddings:
            # Share the embedding matrix with the LM head (transposed matmul).
            self.lm_head = Qwen2LMHead(
                config,
                embedding_weights=self.model.embed_tokens.weight,
                transpose_y=True,
            )
            self.tie_weights()
        else:
            self.lm_head = Qwen2LMHead(config)
        self.padding_side = "left"  # set it to left by default, user can use setter to change padding_sides
1966
+
1967
+ def get_input_embeddings(self):
1968
+ return self.model.embed_tokens
1969
+
1970
+ def set_input_embeddings(self, value):
1971
+ self.model.embed_tokens = value
1972
+
1973
+ def get_output_embeddings(self):
1974
+ return self.lm_head
1975
+
1976
+ def set_output_embeddings(self, new_embeddings):
1977
+ self.lm_head = new_embeddings
1978
+
1979
+ def set_decoder(self, decoder):
1980
+ self.model = decoder
1981
+
1982
+ def get_decoder(self):
1983
+ return self.model
1984
+
1985
    @staticmethod
    def get_rope_index(
        spatial_merge_size,
        image_token_id,
        video_token_id,
        vision_start_token_id,
        input_ids: paddle.Tensor,
        image_grid_thw: Optional[paddle.Tensor] = None,
        video_grid_thw: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [3, 4, 5, 6, 7]
                text height position_ids: [3, 4, 5, 6, 7]
                text width position_ids: [3, 4, 5, 6, 7]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            image_grid_thw (`paddle.Tensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`paddle.Tensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`paddle.Tensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`paddle.Tensor` of shape `(batch_size)`)
        """
        mrope_position_deltas = []
        if image_grid_thw is not None or video_grid_thw is not None:
            total_input_ids = input_ids
            position_ids = paddle.ones(
                [3, input_ids.shape[0], input_ids.shape[1]], dtype=input_ids.dtype
            )
            image_index, video_index = 0, 0
            for i, input_ids in enumerate(total_input_ids):
                # TODO: CUDA error in some paddle version
                if attention_mask is not None:
                    # Keep only non-padding tokens (computed on CPU).
                    input_ids = paddle.to_tensor(
                        input_ids.cpu()[attention_mask[i].cpu() == 1]
                    )  # NOTE: original implementation

                image_nums, video_nums = 0, 0
                # Positions of the vision-start marker tokens in this sample.
                vision_start_indices = paddle.nonzero(
                    input_ids == vision_start_token_id
                ).squeeze(
                    1
                )  # NOTE: original implementation

                # The token right after each vision-start marker tells whether
                # the following block is an image or a video.
                vision_tokens = input_ids[vision_start_indices + 1]
                image_nums = (
                    (vision_tokens == image_token_id).sum()
                    if vision_tokens.numel() > 0
                    else 0
                )
                video_nums = (
                    (vision_tokens == video_token_id).sum()
                    if vision_tokens.numel() > 0
                    else 0
                )
                input_tokens = input_ids.tolist()
                llm_pos_ids_list: list = []
                st = 0
                remain_images, remain_videos = image_nums, video_nums
                # Walk the sequence, alternating text spans and vision blocks.
                for _ in range(image_nums + video_nums):
                    if image_token_id in input_tokens and remain_images > 0:
                        ed_image = input_tokens.index(image_token_id, st)
                    else:
                        ed_image = len(input_tokens) + 1
                    if video_token_id in input_tokens and remain_videos > 0:
                        ed_video = input_tokens.index(video_token_id, st)
                    else:
                        ed_video = len(input_tokens) + 1
                    if ed_image < ed_video:
                        t, h, w = (
                            image_grid_thw[image_index][0],
                            image_grid_thw[image_index][1],
                            image_grid_thw[image_index][2],
                        )
                        image_index += 1
                        remain_images -= 1
                        ed = ed_image
                    else:
                        t, h, w = (
                            video_grid_thw[video_index][0],
                            video_grid_thw[video_index][1],
                            video_grid_thw[video_index][2],
                        )
                        video_index += 1
                        remain_videos -= 1
                        ed = ed_video
                    # Spatial dims are downscaled by the patch-merge factor.
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t.item(),
                        h.item() // spatial_merge_size,
                        w.item() // spatial_merge_size,
                    )
                    text_len = ed - st

                    st_idx = (
                        llm_pos_ids_list[-1].max() + 1
                        if len(llm_pos_ids_list) > 0
                        else 0
                    )
                    # Text span: identical 1D positions on all three axes.
                    llm_pos_ids_list.append(
                        paddle.arange(text_len).reshape([1, -1]).expand([3, -1])
                        + st_idx
                    )

                    # Vision block: distinct t/h/w grids flattened per patch.
                    t_index = (
                        paddle.arange(llm_grid_t)
                        .reshape([-1, 1])
                        .expand([-1, llm_grid_h * llm_grid_w])
                        .flatten()
                    )
                    h_index = (
                        paddle.arange(llm_grid_h)
                        .reshape([1, -1, 1])
                        .expand([llm_grid_t, -1, llm_grid_w])
                        .flatten()
                    )
                    w_index = (
                        paddle.arange(llm_grid_w)
                        .reshape([1, 1, -1])
                        .expand([llm_grid_t, llm_grid_h, -1])
                        .flatten()
                    )
                    llm_pos_ids_list.append(
                        paddle.stack([t_index, h_index, w_index]) + text_len + st_idx
                    )
                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w

                # Trailing text after the last vision block.
                if st < len(input_tokens):
                    st_idx = (
                        llm_pos_ids_list[-1].max() + 1
                        if len(llm_pos_ids_list) > 0
                        else 0
                    )
                    text_len = len(input_tokens) - st
                    llm_pos_ids_list.append(
                        paddle.arange(text_len).reshape([1, -1]).expand([3, -1])
                        + st_idx
                    )

                llm_positions = paddle.concat(llm_pos_ids_list, axis=1).reshape([3, -1])
                # NOTE(review): both write paths below index with
                # attention_mask[i]; this branch appears to assume
                # attention_mask is not None when grids are given — confirm.
                if _IS_NPU:
                    bool_indices = (
                        (attention_mask[i] == 1)
                        .unsqueeze(0)
                        .tile([position_ids.shape[0], 1])
                    )
                    position_ids[:, i] = paddle.index_put(
                        position_ids[:, i], [bool_indices], llm_positions.reshape([-1])
                    )
                else:
                    position_ids[..., i, attention_mask[i] == 1] = llm_positions
                mrope_position_deltas.append(
                    llm_positions.max() + 1 - len(total_input_ids[i])
                )
            mrope_position_deltas = paddle.to_tensor(mrope_position_deltas).unsqueeze(1)
        else:
            # Text-only: plain 1D positions broadcast over the three axes.
            if attention_mask is not None:
                position_ids = paddle.cast(attention_mask, dtype="int64").cumsum(-1) - 1
                position_ids.masked_fill_(mask=attention_mask == 0, value=1)
                position_ids = position_ids.unsqueeze(0).expand([3, -1, -1])
                max_position_ids = position_ids.max(0, keepdim=False)[0].max(
                    -1, keepdim=True
                )[0]
                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
            else:
                position_ids = (
                    paddle.arange(input_ids.shape[1])
                    .reshape([1, 1, -1])
                    .expand(shape=[3, input_ids.shape[0], -1])
                )
                mrope_position_deltas = paddle.zeros(
                    [input_ids.shape[0], 1], dtype=input_ids.dtype
                )

        return position_ids, mrope_position_deltas
2191
+
2192
+ def update_model_kwargs_for_generation(
2193
+ self,
2194
+ outputs: ModelOutput,
2195
+ model_kwargs: Dict[str, Any],
2196
+ is_encoder_decoder: bool = False,
2197
+ # num_new_tokens: int = 1,
2198
+ ) -> Dict[str, Any]:
2199
+ model_kwargs = super().update_model_kwargs_for_generation(
2200
+ outputs=outputs,
2201
+ model_kwargs=model_kwargs,
2202
+ is_encoder_decoder=is_encoder_decoder,
2203
+ )
2204
+
2205
+ if getattr(outputs, "rope_deltas", None) is not None:
2206
+ model_kwargs["rope_deltas"] = outputs.rope_deltas
2207
+
2208
+ return model_kwargs
2209
+
2210
+ def forward(
2211
+ self,
2212
+ input_ids: paddle.Tensor = None,
2213
+ attention_mask: Optional[paddle.Tensor] = None,
2214
+ position_ids: Optional[paddle.Tensor] = None,
2215
+ past_key_values: Optional[List[paddle.Tensor]] = None,
2216
+ inputs_embeds: Optional[paddle.Tensor] = None,
2217
+ labels: Optional[paddle.Tensor] = None,
2218
+ use_cache: Optional[bool] = None,
2219
+ output_attentions: Optional[bool] = None,
2220
+ output_hidden_states: Optional[bool] = None,
2221
+ return_dict: Optional[bool] = None,
2222
+ pixel_values: Optional[paddle.Tensor] = None,
2223
+ pixel_values_videos: Optional[paddle.Tensor] = None,
2224
+ image_grid_thw: Optional[paddle.Tensor] = None,
2225
+ video_grid_thw: Optional[paddle.Tensor] = None,
2226
+ rope_deltas: Optional[paddle.Tensor] = None,
2227
+ ):
2228
+ """
2229
+ Args:
2230
+ labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
2231
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
2232
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
2233
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
2234
+ """
2235
+ output_attentions = (
2236
+ output_attentions
2237
+ if output_attentions is not None
2238
+ else self.config.output_attentions
2239
+ )
2240
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # fmt:skip
2241
+
2242
+ return_dict = True # return_dict if return_dict is not None else self.config.use_return_dict
2243
+
2244
+ if inputs_embeds is None:
2245
+ inputs_embeds = self.model.embed_tokens(input_ids)
2246
+
2247
+ if pixel_values is not None:
2248
+ pixel_values = paddle.cast(pixel_values, inputs_embeds.dtype)
2249
+
2250
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
2251
+ image_embeds = paddle.cast(image_embeds, inputs_embeds.dtype)
2252
+
2253
+ image_mask = input_ids == self.config.image_token_id
2254
+ if self.training:
2255
+ inputs_embeds = inputs_embeds.clone()
2256
+ inputs_embeds[image_mask] = image_embeds
2257
+ if pixel_values_videos is not None:
2258
+ pixel_values_videos = paddle.cast(
2259
+ pixel_values_videos, inputs_embeds.dtype
2260
+ )
2261
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
2262
+ video_embeds = paddle.cast(video_embeds, inputs_embeds.dtype)
2263
+ video_mask = input_ids == self.config.video_token_id
2264
+ inputs_embeds[video_mask] = video_embeds
2265
+ if attention_mask is not None:
2266
+ attention_mask = attention_mask
2267
+
2268
+ outputs = self.model(
2269
+ input_ids=None,
2270
+ position_ids=position_ids,
2271
+ attention_mask=attention_mask,
2272
+ past_key_values=past_key_values,
2273
+ inputs_embeds=inputs_embeds,
2274
+ use_cache=use_cache,
2275
+ output_attentions=output_attentions,
2276
+ output_hidden_states=output_hidden_states,
2277
+ return_dict=return_dict,
2278
+ )
2279
+
2280
+ hidden_states = outputs[0]
2281
+
2282
+ tensor_parallel_output = (
2283
+ self.config.tensor_parallel_output
2284
+ and self.config.tensor_parallel_degree > 1
2285
+ )
2286
+
2287
+ logits = self.lm_head(
2288
+ hidden_states, tensor_parallel_output=tensor_parallel_output
2289
+ )
2290
+
2291
+ logits = paddle.cast(logits, "float32")
2292
+
2293
+ loss = None
2294
+ if labels is not None:
2295
+ # Shift so that tokens < n predict n
2296
+ shift_logits = logits[..., :-1, :]
2297
+ shift_labels = labels[..., 1:]
2298
+ # Flatten the tokens
2299
+ shift_logits = shift_logits.reshape([-1, self.config.vocab_size])
2300
+ shift_labels = shift_labels.reshape([-1])
2301
+ if _IS_NPU:
2302
+ tmp = F.log_softmax(shift_logits, axis=1)
2303
+ loss = F.nll_loss(tmp, shift_labels, reduction="sum")
2304
+ else:
2305
+ loss_fct = nn.CrossEntropyLoss(reduction="sum")
2306
+ loss = loss_fct(shift_logits, shift_labels)
2307
+ label_sum = paddle.sum(shift_labels != -100).cast("float32")
2308
+ loss = loss / label_sum
2309
+
2310
+ if not return_dict:
2311
+ output = (logits,) + tuple(outputs[1:])
2312
+ return (loss,) + output if loss is not None else output
2313
+
2314
+ return Qwen2VLCausalLMOutputWithPast(
2315
+ loss=loss,
2316
+ logits=logits,
2317
+ past_key_values=outputs.past_key_values,
2318
+ hidden_states=outputs.hidden_states,
2319
+ attentions=outputs.attentions,
2320
+ rope_deltas=rope_deltas,
2321
+ )
2322
+
2323
+ def prepare_inputs_for_generation(
2324
+ self,
2325
+ input_ids,
2326
+ past_key_values=None,
2327
+ attention_mask=None,
2328
+ inputs_embeds=None,
2329
+ cache_position=None,
2330
+ position_ids=None,
2331
+ use_cache=True,
2332
+ pixel_values=None,
2333
+ pixel_values_videos=None,
2334
+ image_grid_thw=None,
2335
+ video_grid_thw=None,
2336
+ **kwargs,
2337
+ ):
2338
+
2339
+ batch_size, seq_length = input_ids.shape
2340
+ if past_key_values is None:
2341
+ cache_position = paddle.arange(input_ids.shape[1])
2342
+ else:
2343
+ cache_position = paddle.to_tensor([seq_length - 1])
2344
+
2345
+ if past_key_values is not None:
2346
+ input_ids = input_ids[:, -1].unsqueeze(-1)
2347
+
2348
+ rope_deltas = kwargs.get("rope_deltas", None)
2349
+
2350
+ if attention_mask is not None and position_ids is None:
2351
+ if cache_position is None or (
2352
+ cache_position is not None and cache_position[0] == 0
2353
+ ):
2354
+ position_ids, rope_deltas = self.get_rope_index(
2355
+ self.config.vision_config.spatial_merge_size,
2356
+ self.config.image_token_id,
2357
+ self.config.video_token_id,
2358
+ self.config.vision_start_token_id,
2359
+ input_ids,
2360
+ image_grid_thw,
2361
+ video_grid_thw,
2362
+ attention_mask,
2363
+ )
2364
+ else:
2365
+ batch_size, seq_length = input_ids.shape
2366
+ delta = (
2367
+ cache_position[0] + rope_deltas
2368
+ if cache_position is not None and rope_deltas is not None
2369
+ else 0
2370
+ )
2371
+ position_ids = paddle.arange(seq_length)
2372
+ position_ids = position_ids.reshape([1, -1]).expand([batch_size, -1])
2373
+ position_ids = position_ids + delta
2374
+ position_ids = position_ids.unsqueeze(axis=0).expand([3, -1, -1])
2375
+
2376
+ if cache_position[0] != 0:
2377
+ pixel_values = None
2378
+ pixel_values_videos = None
2379
+
2380
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
2381
+ if inputs_embeds is not None and cache_position[0] == 0:
2382
+ model_inputs = {"inputs_embeds": inputs_embeds}
2383
+ else:
2384
+ model_inputs = {"input_ids": input_ids}
2385
+
2386
+ model_inputs.update(
2387
+ {
2388
+ "position_ids": position_ids, # [3, 1, 3602]
2389
+ "past_key_values": past_key_values, # DynamicCache()
2390
+ "use_cache": use_cache, # 1
2391
+ "attention_mask": attention_mask, # [1, 3602]
2392
+ "pixel_values": pixel_values, # [14308, 1176]
2393
+ "pixel_values_videos": pixel_values_videos,
2394
+ "image_grid_thw": image_grid_thw, # [[ 1, 98, 146]]
2395
+ "video_grid_thw": video_grid_thw,
2396
+ "rope_deltas": rope_deltas, # [[-3504]]
2397
+ }
2398
+ )
2399
+ return model_inputs
2400
+
2401
+ def gme_qwen2_vl_forward(
2402
+ self,
2403
+ input_ids: paddle.Tensor = None,
2404
+ attention_mask: Optional[paddle.Tensor] = None,
2405
+ position_ids: Optional[paddle.Tensor] = None,
2406
+ past_key_values: Optional[List[paddle.Tensor]] = None,
2407
+ inputs_embeds: Optional[paddle.Tensor] = None,
2408
+ labels: Optional[paddle.Tensor] = None,
2409
+ use_cache: Optional[bool] = None,
2410
+ output_attentions: Optional[bool] = None,
2411
+ output_hidden_states: Optional[bool] = None,
2412
+ return_dict: Optional[bool] = None,
2413
+ pixel_values: Optional[paddle.Tensor] = None,
2414
+ pixel_values_videos: Optional[paddle.Tensor] = None,
2415
+ image_grid_thw: Optional[paddle.Tensor] = None,
2416
+ video_grid_thw: Optional[paddle.Tensor] = None,
2417
+ rope_deltas: Optional[paddle.Tensor] = None,
2418
+ ):
2419
+
2420
+ output_attentions = (
2421
+ output_attentions
2422
+ if output_attentions is not None
2423
+ else self.config.output_attentions
2424
+ )
2425
+ output_hidden_states = (
2426
+ output_hidden_states
2427
+ if output_hidden_states is not None
2428
+ else self.config.output_hidden_states
2429
+ )
2430
+ return_dict = True # return_dict if return_dict is not None else self.config.use_return_dict
2431
+
2432
+ if inputs_embeds is None:
2433
+ inputs_embeds = self.model.embed_tokens(input_ids)
2434
+ if pixel_values is not None:
2435
+ # 确保 pixel_values 和 inputs_embeds 使用相同的数据类型
2436
+ pixel_values = paddle.cast(pixel_values, inputs_embeds.dtype)
2437
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
2438
+ # 确保 image_embeds 和 inputs_embeds 使用相同的数据类型
2439
+ image_embeds = paddle.cast(image_embeds, inputs_embeds.dtype)
2440
+ image_mask = input_ids == self.config.image_token_id
2441
+ if self.training:
2442
+ inputs_embeds = inputs_embeds.clone()
2443
+
2444
+ inputs_embeds[image_mask] = image_embeds
2445
+
2446
+ if pixel_values_videos is not None:
2447
+ # 确保 pixel_values_videos 和 inputs_embeds 使用相同的数据类型
2448
+ pixel_values_videos = paddle.cast(
2449
+ pixel_values_videos, inputs_embeds.dtype
2450
+ )
2451
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
2452
+ # 确保 video_embeds 和 inputs_embeds 使用相同的数据类型
2453
+ video_embeds = paddle.cast(video_embeds, inputs_embeds.dtype)
2454
+ video_mask = input_ids == self.config.video_token_id
2455
+ inputs_embeds[video_mask] = video_embeds
2456
+ if attention_mask is not None:
2457
+ attention_mask = attention_mask
2458
+
2459
+ outputs = self.model(
2460
+ input_ids=None,
2461
+ position_ids=position_ids,
2462
+ attention_mask=attention_mask,
2463
+ past_key_values=past_key_values,
2464
+ inputs_embeds=inputs_embeds,
2465
+ use_cache=use_cache,
2466
+ output_attentions=output_attentions,
2467
+ output_hidden_states=output_hidden_states,
2468
+ return_dict=return_dict,
2469
+ )
2470
+
2471
+ hidden_states = outputs[0]
2472
+ # get last hidden state
2473
+ last_hidden_state = hidden_states[:, -1, :]
2474
+ return last_hidden_state
2475
+
2476
+
2477
class PPDocBeeInference(Qwen2VLForConditionalGeneration):
    """PP-DocBee inference wrapper with benchmarked generation."""

    set_inference_operations(get_inference_operations() + ["docbee_generate"])

    @benchmark.timeit_with_options(name="docbee_generate")
    def generate(self, inputs, **kwargs):
        """Generate token ids for `inputs`.

        Sampling options (max_new_tokens, temperature, top_p, top_k) may be
        overridden via keyword arguments; otherwise the defaults below apply.
        """
        defaults = {
            "max_new_tokens": 2048,
            "temperature": 0.1,
            "top_p": 0.001,
            "top_k": 1,
        }
        options = {name: kwargs.get(name, value) for name, value in defaults.items()}
        # Inference only — no gradients needed.
        with paddle.no_grad():
            return super().generate(**inputs, **options)