paddlex-2.0.0rc4-py3-none-any.whl → paddlex-3.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
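
The listing below was generated by a registry diff viewer. As a rough local approximation (a sketch, not the registry's actual tooling), the same file-level comparison can be reproduced by listing the members of each wheel archive; the wheel filenames used here are assumptions about how the downloaded files are named on disk.

```python
# Minimal sketch (not the registry's own tooling) for approximating the
# file-level comparison below, assuming both wheels have already been
# downloaded locally, e.g. with:
#   pip download paddlex==2.0.0rc4 --no-deps
#   pip download paddlex==3.0.0 --no-deps
from zipfile import ZipFile

OLD_WHEEL = "paddlex-2.0.0rc4-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "paddlex-3.0.0-py3-none-any.whl"     # assumed local filename


def members(path):
    """Return the set of file paths contained in a wheel (a zip archive)."""
    with ZipFile(path) as wheel:
        return {name for name in wheel.namelist() if not name.endswith("/")}


old_files = members(OLD_WHEEL)
new_files = members(NEW_WHEEL)

added = sorted(new_files - old_files)
removed = sorted(old_files - new_files)

print(f"added:   {len(added)}")
print(f"removed: {len(removed)}")
for name in added[:20]:  # preview the first few newly added files
    print("  +", name)
```

The per-file +/- line counts shown in the listing would additionally require a text diff of each extracted member, which this sketch omits.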
Files changed (1533)
  1. paddlex/.version +1 -0
  2. paddlex/__init__.py +35 -18
  3. paddlex/__main__.py +39 -0
  4. paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml +38 -0
  5. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  6. paddlex/configs/modules/doc_text_orientation/PP-LCNet_x1_0_doc_ori.yaml +41 -0
  7. paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
  8. paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
  9. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  10. paddlex/configs/modules/face_detection/BlazeFace-FPN-SSH.yaml +40 -0
  11. paddlex/configs/modules/face_detection/BlazeFace.yaml +40 -0
  12. paddlex/configs/modules/face_detection/PP-YOLOE_plus-S_face.yaml +40 -0
  13. paddlex/configs/modules/face_detection/PicoDet_LCNet_x2_5_face.yaml +40 -0
  14. paddlex/configs/modules/face_feature/MobileFaceNet.yaml +41 -0
  15. paddlex/configs/modules/face_feature/ResNet50_face.yaml +41 -0
  16. paddlex/configs/modules/formula_recognition/LaTeX_OCR_rec.yaml +40 -0
  17. paddlex/configs/modules/formula_recognition/PP-FormulaNet-L.yaml +40 -0
  18. paddlex/configs/modules/formula_recognition/PP-FormulaNet-S.yaml +40 -0
  19. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  20. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  21. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  22. paddlex/configs/modules/formula_recognition/UniMERNet.yaml +40 -0
  23. paddlex/configs/modules/human_detection/PP-YOLOE-L_human.yaml +42 -0
  24. paddlex/configs/modules/human_detection/PP-YOLOE-S_human.yaml +42 -0
  25. paddlex/configs/modules/image_anomaly_detection/STFPM.yaml +41 -0
  26. paddlex/configs/modules/image_classification/CLIP_vit_base_patch16_224.yaml +41 -0
  27. paddlex/configs/modules/image_classification/CLIP_vit_large_patch14_224.yaml +41 -0
  28. paddlex/configs/modules/image_classification/ConvNeXt_base_224.yaml +41 -0
  29. paddlex/configs/modules/image_classification/ConvNeXt_base_384.yaml +41 -0
  30. paddlex/configs/modules/image_classification/ConvNeXt_large_224.yaml +41 -0
  31. paddlex/configs/modules/image_classification/ConvNeXt_large_384.yaml +41 -0
  32. paddlex/configs/modules/image_classification/ConvNeXt_small.yaml +41 -0
  33. paddlex/configs/modules/image_classification/ConvNeXt_tiny.yaml +41 -0
  34. paddlex/configs/modules/image_classification/FasterNet-L.yaml +40 -0
  35. paddlex/configs/modules/image_classification/FasterNet-M.yaml +40 -0
  36. paddlex/configs/modules/image_classification/FasterNet-S.yaml +40 -0
  37. paddlex/configs/modules/image_classification/FasterNet-T0.yaml +40 -0
  38. paddlex/configs/modules/image_classification/FasterNet-T1.yaml +40 -0
  39. paddlex/configs/modules/image_classification/FasterNet-T2.yaml +40 -0
  40. paddlex/configs/modules/image_classification/MobileNetV1_x0_25.yaml +41 -0
  41. paddlex/configs/modules/image_classification/MobileNetV1_x0_5.yaml +41 -0
  42. paddlex/configs/modules/image_classification/MobileNetV1_x0_75.yaml +41 -0
  43. paddlex/configs/modules/image_classification/MobileNetV1_x1_0.yaml +41 -0
  44. paddlex/configs/modules/image_classification/MobileNetV2_x0_25.yaml +41 -0
  45. paddlex/configs/modules/image_classification/MobileNetV2_x0_5.yaml +41 -0
  46. paddlex/configs/modules/image_classification/MobileNetV2_x1_0.yaml +41 -0
  47. paddlex/configs/modules/image_classification/MobileNetV2_x1_5.yaml +41 -0
  48. paddlex/configs/modules/image_classification/MobileNetV2_x2_0.yaml +41 -0
  49. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_35.yaml +41 -0
  50. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_5.yaml +41 -0
  51. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_75.yaml +41 -0
  52. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_0.yaml +41 -0
  53. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_25.yaml +41 -0
  54. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_35.yaml +41 -0
  55. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_5.yaml +41 -0
  56. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_75.yaml +41 -0
  57. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_0.yaml +41 -0
  58. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_25.yaml +41 -0
  59. paddlex/configs/modules/image_classification/MobileNetV4_conv_large.yaml +41 -0
  60. paddlex/configs/modules/image_classification/MobileNetV4_conv_medium.yaml +41 -0
  61. paddlex/configs/modules/image_classification/MobileNetV4_conv_small.yaml +41 -0
  62. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_large.yaml +41 -0
  63. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_medium.yaml +41 -0
  64. paddlex/configs/modules/image_classification/PP-HGNetV2-B0.yaml +41 -0
  65. paddlex/configs/modules/image_classification/PP-HGNetV2-B1.yaml +41 -0
  66. paddlex/configs/modules/image_classification/PP-HGNetV2-B2.yaml +41 -0
  67. paddlex/configs/modules/image_classification/PP-HGNetV2-B3.yaml +41 -0
  68. paddlex/configs/modules/image_classification/PP-HGNetV2-B4.yaml +41 -0
  69. paddlex/configs/modules/image_classification/PP-HGNetV2-B5.yaml +41 -0
  70. paddlex/configs/modules/image_classification/PP-HGNetV2-B6.yaml +41 -0
  71. paddlex/configs/modules/image_classification/PP-HGNet_base.yaml +41 -0
  72. paddlex/configs/modules/image_classification/PP-HGNet_small.yaml +41 -0
  73. paddlex/configs/modules/image_classification/PP-HGNet_tiny.yaml +41 -0
  74. paddlex/configs/modules/image_classification/PP-LCNetV2_base.yaml +41 -0
  75. paddlex/configs/modules/image_classification/PP-LCNetV2_large.yaml +41 -0
  76. paddlex/configs/modules/image_classification/PP-LCNetV2_small.yaml +41 -0
  77. paddlex/configs/modules/image_classification/PP-LCNet_x0_25.yaml +41 -0
  78. paddlex/configs/modules/image_classification/PP-LCNet_x0_35.yaml +41 -0
  79. paddlex/configs/modules/image_classification/PP-LCNet_x0_5.yaml +41 -0
  80. paddlex/configs/modules/image_classification/PP-LCNet_x0_75.yaml +41 -0
  81. paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yaml +41 -0
  82. paddlex/configs/modules/image_classification/PP-LCNet_x1_5.yaml +41 -0
  83. paddlex/configs/modules/image_classification/PP-LCNet_x2_0.yaml +41 -0
  84. paddlex/configs/modules/image_classification/PP-LCNet_x2_5.yaml +41 -0
  85. paddlex/configs/modules/image_classification/ResNet101.yaml +41 -0
  86. paddlex/configs/modules/image_classification/ResNet101_vd.yaml +41 -0
  87. paddlex/configs/modules/image_classification/ResNet152.yaml +41 -0
  88. paddlex/configs/modules/image_classification/ResNet152_vd.yaml +41 -0
  89. paddlex/configs/modules/image_classification/ResNet18.yaml +41 -0
  90. paddlex/configs/modules/image_classification/ResNet18_vd.yaml +41 -0
  91. paddlex/configs/modules/image_classification/ResNet200_vd.yaml +41 -0
  92. paddlex/configs/modules/image_classification/ResNet34.yaml +41 -0
  93. paddlex/configs/modules/image_classification/ResNet34_vd.yaml +41 -0
  94. paddlex/configs/modules/image_classification/ResNet50.yaml +41 -0
  95. paddlex/configs/modules/image_classification/ResNet50_vd.yaml +41 -0
  96. paddlex/configs/modules/image_classification/StarNet-S1.yaml +41 -0
  97. paddlex/configs/modules/image_classification/StarNet-S2.yaml +41 -0
  98. paddlex/configs/modules/image_classification/StarNet-S3.yaml +41 -0
  99. paddlex/configs/modules/image_classification/StarNet-S4.yaml +41 -0
  100. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window12_384.yaml +41 -0
  101. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window7_224.yaml +41 -0
  102. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window12_384.yaml +41 -0
  103. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window7_224.yaml +41 -0
  104. paddlex/configs/modules/image_classification/SwinTransformer_small_patch4_window7_224.yaml +41 -0
  105. paddlex/configs/modules/image_classification/SwinTransformer_tiny_patch4_window7_224.yaml +41 -0
  106. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec.yaml +42 -0
  107. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_base.yaml +42 -0
  108. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_large.yaml +41 -0
  109. paddlex/configs/modules/image_multilabel_classification/CLIP_vit_base_patch16_448_ML.yaml +41 -0
  110. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B0_ML.yaml +41 -0
  111. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B4_ML.yaml +41 -0
  112. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B6_ML.yaml +41 -0
  113. paddlex/configs/modules/image_multilabel_classification/PP-LCNet_x1_0_ML.yaml +41 -0
  114. paddlex/configs/modules/image_multilabel_classification/ResNet50_ML.yaml +41 -0
  115. paddlex/configs/modules/image_unwarping/UVDoc.yaml +12 -0
  116. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-FPN.yaml +40 -0
  117. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-vd-SSLDv2-FPN.yaml +40 -0
  118. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-H.yaml +40 -0
  119. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-L.yaml +40 -0
  120. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-M.yaml +40 -0
  121. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-S.yaml +40 -0
  122. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-X.yaml +40 -0
  123. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNeXt101-vd-FPN.yaml +39 -0
  124. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-FPN.yaml +40 -0
  125. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-vd-FPN.yaml +40 -0
  126. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-FPN.yaml +40 -0
  127. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-vd-FPN.yaml +40 -0
  128. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50.yaml +40 -0
  129. paddlex/configs/modules/instance_segmentation/PP-YOLOE_seg-S.yaml +40 -0
  130. paddlex/configs/modules/instance_segmentation/SOLOv2.yaml +40 -0
  131. paddlex/configs/modules/keypoint_detection/PP-TinyPose_128x96.yaml +40 -0
  132. paddlex/configs/modules/keypoint_detection/PP-TinyPose_256x192.yaml +40 -0
  133. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  134. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +40 -0
  135. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +40 -0
  136. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +40 -0
  137. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  138. paddlex/configs/modules/layout_detection/PicoDet-L_layout_17cls.yaml +40 -0
  139. paddlex/configs/modules/layout_detection/PicoDet-L_layout_3cls.yaml +40 -0
  140. paddlex/configs/modules/layout_detection/PicoDet-S_layout_17cls.yaml +40 -0
  141. paddlex/configs/modules/layout_detection/PicoDet-S_layout_3cls.yaml +40 -0
  142. paddlex/configs/modules/layout_detection/PicoDet_layout_1x.yaml +40 -0
  143. paddlex/configs/modules/layout_detection/PicoDet_layout_1x_table.yaml +40 -0
  144. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_17cls.yaml +40 -0
  145. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_3cls.yaml +40 -0
  146. paddlex/configs/modules/mainbody_detection/PP-ShiTuV2_det.yaml +41 -0
  147. paddlex/configs/modules/multilingual_speech_recognition/whisper_base.yaml +12 -0
  148. paddlex/configs/modules/multilingual_speech_recognition/whisper_large.yaml +12 -0
  149. paddlex/configs/modules/multilingual_speech_recognition/whisper_medium.yaml +12 -0
  150. paddlex/configs/modules/multilingual_speech_recognition/whisper_small.yaml +12 -0
  151. paddlex/configs/modules/multilingual_speech_recognition/whisper_tiny.yaml +12 -0
  152. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-FPN.yaml +41 -0
  153. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  154. paddlex/configs/modules/object_detection/CenterNet-DLA-34.yaml +41 -0
  155. paddlex/configs/modules/object_detection/CenterNet-ResNet50.yaml +41 -0
  156. paddlex/configs/modules/object_detection/Co-DINO-R50.yaml +40 -0
  157. paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml +40 -0
  158. paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml +40 -0
  159. paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml +40 -0
  160. paddlex/configs/modules/object_detection/DETR-R50.yaml +42 -0
  161. paddlex/configs/modules/object_detection/FCOS-ResNet50.yaml +41 -0
  162. paddlex/configs/modules/object_detection/FasterRCNN-ResNeXt101-vd-FPN.yaml +42 -0
  163. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101-FPN.yaml +42 -0
  164. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101.yaml +42 -0
  165. paddlex/configs/modules/object_detection/FasterRCNN-ResNet34-FPN.yaml +42 -0
  166. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-FPN.yaml +42 -0
  167. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-FPN.yaml +42 -0
  168. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  169. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50.yaml +42 -0
  170. paddlex/configs/modules/object_detection/FasterRCNN-Swin-Tiny-FPN.yaml +42 -0
  171. paddlex/configs/modules/object_detection/PP-YOLOE_plus-L.yaml +40 -0
  172. paddlex/configs/modules/object_detection/PP-YOLOE_plus-M.yaml +40 -0
  173. paddlex/configs/modules/object_detection/PP-YOLOE_plus-S.yaml +40 -0
  174. paddlex/configs/modules/object_detection/PP-YOLOE_plus-X.yaml +40 -0
  175. paddlex/configs/modules/object_detection/PicoDet-L.yaml +40 -0
  176. paddlex/configs/modules/object_detection/PicoDet-M.yaml +42 -0
  177. paddlex/configs/modules/object_detection/PicoDet-S.yaml +40 -0
  178. paddlex/configs/modules/object_detection/PicoDet-XS.yaml +42 -0
  179. paddlex/configs/modules/object_detection/RT-DETR-H.yaml +40 -0
  180. paddlex/configs/modules/object_detection/RT-DETR-L.yaml +40 -0
  181. paddlex/configs/modules/object_detection/RT-DETR-R18.yaml +40 -0
  182. paddlex/configs/modules/object_detection/RT-DETR-R50.yaml +40 -0
  183. paddlex/configs/modules/object_detection/RT-DETR-X.yaml +40 -0
  184. paddlex/configs/modules/object_detection/YOLOX-L.yaml +40 -0
  185. paddlex/configs/modules/object_detection/YOLOX-M.yaml +40 -0
  186. paddlex/configs/modules/object_detection/YOLOX-N.yaml +40 -0
  187. paddlex/configs/modules/object_detection/YOLOX-S.yaml +40 -0
  188. paddlex/configs/modules/object_detection/YOLOX-T.yaml +40 -0
  189. paddlex/configs/modules/object_detection/YOLOX-X.yaml +40 -0
  190. paddlex/configs/modules/object_detection/YOLOv3-DarkNet53.yaml +40 -0
  191. paddlex/configs/modules/object_detection/YOLOv3-MobileNetV3.yaml +40 -0
  192. paddlex/configs/modules/object_detection/YOLOv3-ResNet50_vd_DCN.yaml +40 -0
  193. paddlex/configs/modules/open_vocabulary_detection/GroundingDINO-T.yaml +13 -0
  194. paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
  195. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_box.yaml +17 -0
  196. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_point.yaml +15 -0
  197. paddlex/configs/modules/pedestrian_attribute_recognition/PP-LCNet_x1_0_pedestrian_attribute.yaml +41 -0
  198. paddlex/configs/modules/rotated_object_detection/PP-YOLOE-R-L.yaml +40 -0
  199. paddlex/configs/modules/seal_text_detection/PP-OCRv4_mobile_seal_det.yaml +40 -0
  200. paddlex/configs/modules/seal_text_detection/PP-OCRv4_server_seal_det.yaml +40 -0
  201. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R101.yaml +40 -0
  202. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R50.yaml +40 -0
  203. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R101.yaml +40 -0
  204. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R50.yaml +40 -0
  205. paddlex/configs/modules/semantic_segmentation/MaskFormer_small.yaml +42 -0
  206. paddlex/configs/modules/semantic_segmentation/MaskFormer_tiny.yaml +42 -0
  207. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W18.yaml +40 -0
  208. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W48.yaml +40 -0
  209. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-B.yaml +41 -0
  210. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-T.yaml +40 -0
  211. paddlex/configs/modules/semantic_segmentation/SeaFormer_base.yaml +40 -0
  212. paddlex/configs/modules/semantic_segmentation/SeaFormer_large.yaml +40 -0
  213. paddlex/configs/modules/semantic_segmentation/SeaFormer_small.yaml +40 -0
  214. paddlex/configs/modules/semantic_segmentation/SeaFormer_tiny.yaml +40 -0
  215. paddlex/configs/modules/semantic_segmentation/SegFormer-B0.yaml +40 -0
  216. paddlex/configs/modules/semantic_segmentation/SegFormer-B1.yaml +40 -0
  217. paddlex/configs/modules/semantic_segmentation/SegFormer-B2.yaml +40 -0
  218. paddlex/configs/modules/semantic_segmentation/SegFormer-B3.yaml +40 -0
  219. paddlex/configs/modules/semantic_segmentation/SegFormer-B4.yaml +40 -0
  220. paddlex/configs/modules/semantic_segmentation/SegFormer-B5.yaml +40 -0
  221. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-L.yaml +42 -0
  222. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-S.yaml +42 -0
  223. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-largesize-L.yaml +42 -0
  224. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wired_table_cell_det.yaml +40 -0
  225. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wireless_table_cell_det.yaml +40 -0
  226. paddlex/configs/modules/table_classification/PP-LCNet_x1_0_table_cls.yaml +41 -0
  227. paddlex/configs/modules/table_structure_recognition/SLANeXt_wired.yaml +39 -0
  228. paddlex/configs/modules/table_structure_recognition/SLANeXt_wireless.yaml +39 -0
  229. paddlex/configs/modules/table_structure_recognition/SLANet.yaml +39 -0
  230. paddlex/configs/modules/table_structure_recognition/SLANet_plus.yaml +39 -0
  231. paddlex/configs/modules/text_detection/PP-OCRv3_mobile_det.yaml +40 -0
  232. paddlex/configs/modules/text_detection/PP-OCRv3_server_det.yaml +40 -0
  233. paddlex/configs/modules/text_detection/PP-OCRv4_mobile_det.yaml +40 -0
  234. paddlex/configs/modules/text_detection/PP-OCRv4_server_det.yaml +40 -0
  235. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  236. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  237. paddlex/configs/modules/text_recognition/PP-OCRv3_mobile_rec.yaml +39 -0
  238. paddlex/configs/modules/text_recognition/PP-OCRv4_mobile_rec.yaml +39 -0
  239. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec.yaml +39 -0
  240. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec_doc.yaml +39 -0
  241. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  242. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  243. paddlex/configs/modules/text_recognition/arabic_PP-OCRv3_mobile_rec.yaml +39 -0
  244. paddlex/configs/modules/text_recognition/ch_RepSVTR_rec.yaml +39 -0
  245. paddlex/configs/modules/text_recognition/ch_SVTRv2_rec.yaml +39 -0
  246. paddlex/configs/modules/text_recognition/chinese_cht_PP-OCRv3_mobile_rec.yaml +39 -0
  247. paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv3_mobile_rec.yaml +39 -0
  248. paddlex/configs/modules/text_recognition/devanagari_PP-OCRv3_mobile_rec.yaml +39 -0
  249. paddlex/configs/modules/text_recognition/en_PP-OCRv3_mobile_rec.yaml +39 -0
  250. paddlex/configs/modules/text_recognition/en_PP-OCRv4_mobile_rec.yaml +39 -0
  251. paddlex/configs/modules/text_recognition/japan_PP-OCRv3_mobile_rec.yaml +39 -0
  252. paddlex/configs/modules/text_recognition/ka_PP-OCRv3_mobile_rec.yaml +39 -0
  253. paddlex/configs/modules/text_recognition/korean_PP-OCRv3_mobile_rec.yaml +39 -0
  254. paddlex/configs/modules/text_recognition/latin_PP-OCRv3_mobile_rec.yaml +39 -0
  255. paddlex/configs/modules/text_recognition/ta_PP-OCRv3_mobile_rec.yaml +39 -0
  256. paddlex/configs/modules/text_recognition/te_PP-OCRv3_mobile_rec.yaml +39 -0
  257. paddlex/configs/modules/textline_orientation/PP-LCNet_x0_25_textline_ori.yaml +41 -0
  258. paddlex/configs/modules/ts_anomaly_detection/AutoEncoder_ad.yaml +37 -0
  259. paddlex/configs/modules/ts_anomaly_detection/DLinear_ad.yaml +37 -0
  260. paddlex/configs/modules/ts_anomaly_detection/Nonstationary_ad.yaml +37 -0
  261. paddlex/configs/modules/ts_anomaly_detection/PatchTST_ad.yaml +37 -0
  262. paddlex/configs/modules/ts_anomaly_detection/TimesNet_ad.yaml +37 -0
  263. paddlex/configs/modules/ts_classification/TimesNet_cls.yaml +37 -0
  264. paddlex/configs/modules/ts_forecast/DLinear.yaml +38 -0
  265. paddlex/configs/modules/ts_forecast/NLinear.yaml +38 -0
  266. paddlex/configs/modules/ts_forecast/Nonstationary.yaml +38 -0
  267. paddlex/configs/modules/ts_forecast/PatchTST.yaml +38 -0
  268. paddlex/configs/modules/ts_forecast/RLinear.yaml +38 -0
  269. paddlex/configs/modules/ts_forecast/TiDE.yaml +38 -0
  270. paddlex/configs/modules/ts_forecast/TimesNet.yaml +38 -0
  271. paddlex/configs/modules/vehicle_attribute_recognition/PP-LCNet_x1_0_vehicle_attribute.yaml +41 -0
  272. paddlex/configs/modules/vehicle_detection/PP-YOLOE-L_vehicle.yaml +41 -0
  273. paddlex/configs/modules/vehicle_detection/PP-YOLOE-S_vehicle.yaml +42 -0
  274. paddlex/configs/modules/video_classification/PP-TSM-R50_8frames_uniform.yaml +42 -0
  275. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_16frames_uniform.yaml +42 -0
  276. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_8frames_uniform.yaml +42 -0
  277. paddlex/configs/modules/video_detection/YOWO.yaml +40 -0
  278. paddlex/configs/pipelines/3d_bev_detection.yaml +9 -0
  279. paddlex/configs/pipelines/OCR.yaml +45 -0
  280. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +151 -0
  281. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +237 -0
  282. paddlex/configs/pipelines/PP-ShiTuV2.yaml +18 -0
  283. paddlex/configs/pipelines/PP-StructureV3.yaml +226 -0
  284. paddlex/configs/pipelines/anomaly_detection.yaml +8 -0
  285. paddlex/configs/pipelines/doc_preprocessor.yaml +15 -0
  286. paddlex/configs/pipelines/doc_understanding.yaml +9 -0
  287. paddlex/configs/pipelines/face_recognition.yaml +18 -0
  288. paddlex/configs/pipelines/formula_recognition.yaml +39 -0
  289. paddlex/configs/pipelines/human_keypoint_detection.yaml +17 -0
  290. paddlex/configs/pipelines/image_classification.yaml +10 -0
  291. paddlex/configs/pipelines/image_multilabel_classification.yaml +9 -0
  292. paddlex/configs/pipelines/instance_segmentation.yaml +10 -0
  293. paddlex/configs/pipelines/layout_parsing.yaml +102 -0
  294. paddlex/configs/pipelines/multilingual_speech_recognition.yaml +9 -0
  295. paddlex/configs/pipelines/object_detection.yaml +10 -0
  296. paddlex/configs/pipelines/open_vocabulary_detection.yaml +12 -0
  297. paddlex/configs/pipelines/open_vocabulary_segmentation.yaml +13 -0
  298. paddlex/configs/pipelines/pedestrian_attribute_recognition.yaml +15 -0
  299. paddlex/configs/pipelines/rotated_object_detection.yaml +10 -0
  300. paddlex/configs/pipelines/seal_recognition.yaml +52 -0
  301. paddlex/configs/pipelines/semantic_segmentation.yaml +10 -0
  302. paddlex/configs/pipelines/small_object_detection.yaml +10 -0
  303. paddlex/configs/pipelines/table_recognition.yaml +57 -0
  304. paddlex/configs/pipelines/table_recognition_v2.yaml +82 -0
  305. paddlex/configs/pipelines/ts_anomaly_detection.yaml +8 -0
  306. paddlex/configs/pipelines/ts_classification.yaml +8 -0
  307. paddlex/configs/pipelines/ts_forecast.yaml +8 -0
  308. paddlex/configs/pipelines/vehicle_attribute_recognition.yaml +15 -0
  309. paddlex/configs/pipelines/video_classification.yaml +9 -0
  310. paddlex/configs/pipelines/video_detection.yaml +10 -0
  311. paddlex/constants.py +17 -0
  312. paddlex/engine.py +56 -0
  313. paddlex/hpip_links.html +31 -0
  314. paddlex/inference/__init__.py +19 -0
  315. paddlex/inference/common/__init__.py +13 -0
  316. paddlex/inference/common/batch_sampler/__init__.py +21 -0
  317. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +83 -0
  318. paddlex/inference/common/batch_sampler/base_batch_sampler.py +94 -0
  319. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +144 -0
  320. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +87 -0
  321. paddlex/inference/common/batch_sampler/image_batch_sampler.py +121 -0
  322. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +109 -0
  323. paddlex/inference/common/batch_sampler/video_batch_sampler.py +74 -0
  324. paddlex/inference/common/reader/__init__.py +19 -0
  325. paddlex/inference/common/reader/audio_reader.py +46 -0
  326. paddlex/inference/common/reader/det_3d_reader.py +241 -0
  327. paddlex/inference/common/reader/image_reader.py +73 -0
  328. paddlex/inference/common/reader/ts_reader.py +46 -0
  329. paddlex/inference/common/reader/video_reader.py +42 -0
  330. paddlex/inference/common/result/__init__.py +29 -0
  331. paddlex/inference/common/result/base_cv_result.py +41 -0
  332. paddlex/inference/common/result/base_result.py +72 -0
  333. paddlex/inference/common/result/base_ts_result.py +41 -0
  334. paddlex/inference/common/result/base_video_result.py +36 -0
  335. paddlex/inference/common/result/mixin.py +709 -0
  336. paddlex/inference/models/__init__.py +86 -0
  337. paddlex/inference/models/anomaly_detection/__init__.py +15 -0
  338. paddlex/inference/models/anomaly_detection/predictor.py +135 -0
  339. paddlex/inference/models/anomaly_detection/processors.py +53 -0
  340. paddlex/inference/models/anomaly_detection/result.py +71 -0
  341. paddlex/inference/models/base/__init__.py +15 -0
  342. paddlex/inference/models/base/predictor/__init__.py +15 -0
  343. paddlex/inference/models/base/predictor/base_predictor.py +414 -0
  344. paddlex/inference/models/common/__init__.py +26 -0
  345. paddlex/inference/models/common/static_infer.py +801 -0
  346. paddlex/inference/models/common/tokenizer/__init__.py +21 -0
  347. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +655 -0
  348. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
  349. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +453 -0
  350. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  351. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +438 -0
  352. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  353. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +2149 -0
  354. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3720 -0
  355. paddlex/inference/models/common/tokenizer/utils.py +66 -0
  356. paddlex/inference/models/common/tokenizer/vocab.py +647 -0
  357. paddlex/inference/models/common/ts/__init__.py +15 -0
  358. paddlex/inference/models/common/ts/funcs.py +540 -0
  359. paddlex/inference/models/common/ts/processors.py +322 -0
  360. paddlex/inference/models/common/vision/__init__.py +23 -0
  361. paddlex/inference/models/common/vision/funcs.py +98 -0
  362. paddlex/inference/models/common/vision/processors.py +285 -0
  363. paddlex/inference/models/common/vlm/__init__.py +13 -0
  364. paddlex/inference/models/common/vlm/activations.py +189 -0
  365. paddlex/inference/models/common/vlm/bert_padding.py +127 -0
  366. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  367. paddlex/inference/models/common/vlm/distributed.py +229 -0
  368. paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
  369. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  370. paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
  371. paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
  372. paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
  373. paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
  374. paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
  375. paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
  376. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
  377. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
  378. paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
  379. paddlex/inference/models/common/vlm/transformers/model_utils.py +2014 -0
  380. paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
  381. paddlex/inference/models/common/vlm/utils.py +109 -0
  382. paddlex/inference/models/doc_vlm/__init__.py +15 -0
  383. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  384. paddlex/inference/models/doc_vlm/modeling/__init__.py +17 -0
  385. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  386. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  387. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2495 -0
  388. paddlex/inference/models/doc_vlm/predictor.py +253 -0
  389. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  390. paddlex/inference/models/doc_vlm/processors/__init__.py +17 -0
  391. paddlex/inference/models/doc_vlm/processors/common.py +561 -0
  392. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  393. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +543 -0
  394. paddlex/inference/models/doc_vlm/result.py +21 -0
  395. paddlex/inference/models/face_feature/__init__.py +15 -0
  396. paddlex/inference/models/face_feature/predictor.py +66 -0
  397. paddlex/inference/models/formula_recognition/__init__.py +15 -0
  398. paddlex/inference/models/formula_recognition/predictor.py +193 -0
  399. paddlex/inference/models/formula_recognition/processors.py +1015 -0
  400. paddlex/inference/models/formula_recognition/result.py +411 -0
  401. paddlex/inference/models/image_classification/__init__.py +15 -0
  402. paddlex/inference/models/image_classification/predictor.py +172 -0
  403. paddlex/inference/models/image_classification/processors.py +89 -0
  404. paddlex/inference/models/image_classification/result.py +93 -0
  405. paddlex/inference/models/image_feature/__init__.py +15 -0
  406. paddlex/inference/models/image_feature/predictor.py +146 -0
  407. paddlex/inference/models/image_feature/processors.py +31 -0
  408. paddlex/inference/models/image_feature/result.py +32 -0
  409. paddlex/inference/models/image_multilabel_classification/__init__.py +15 -0
  410. paddlex/inference/models/image_multilabel_classification/predictor.py +95 -0
  411. paddlex/inference/models/image_multilabel_classification/processors.py +89 -0
  412. paddlex/inference/models/image_multilabel_classification/result.py +96 -0
  413. paddlex/inference/models/image_unwarping/__init__.py +15 -0
  414. paddlex/inference/models/image_unwarping/predictor.py +97 -0
  415. paddlex/inference/models/image_unwarping/processors.py +92 -0
  416. paddlex/inference/models/image_unwarping/result.py +47 -0
  417. paddlex/inference/models/instance_segmentation/__init__.py +15 -0
  418. paddlex/inference/models/instance_segmentation/predictor.py +202 -0
  419. paddlex/inference/models/instance_segmentation/processors.py +102 -0
  420. paddlex/inference/models/instance_segmentation/result.py +162 -0
  421. paddlex/inference/models/keypoint_detection/__init__.py +15 -0
  422. paddlex/inference/models/keypoint_detection/predictor.py +190 -0
  423. paddlex/inference/models/keypoint_detection/processors.py +367 -0
  424. paddlex/inference/models/keypoint_detection/result.py +197 -0
  425. paddlex/inference/models/m_3d_bev_detection/__init__.py +15 -0
  426. paddlex/inference/models/m_3d_bev_detection/predictor.py +303 -0
  427. paddlex/inference/models/m_3d_bev_detection/processors.py +990 -0
  428. paddlex/inference/models/m_3d_bev_detection/result.py +68 -0
  429. paddlex/inference/models/m_3d_bev_detection/visualizer_3d.py +169 -0
  430. paddlex/inference/models/multilingual_speech_recognition/__init__.py +15 -0
  431. paddlex/inference/models/multilingual_speech_recognition/predictor.py +137 -0
  432. paddlex/inference/models/multilingual_speech_recognition/processors.py +1933 -0
  433. paddlex/inference/models/multilingual_speech_recognition/result.py +21 -0
  434. paddlex/inference/models/object_detection/__init__.py +15 -0
  435. paddlex/inference/models/object_detection/predictor.py +344 -0
  436. paddlex/inference/models/object_detection/processors.py +885 -0
  437. paddlex/inference/models/object_detection/result.py +114 -0
  438. paddlex/inference/models/object_detection/utils.py +70 -0
  439. paddlex/inference/models/open_vocabulary_detection/__init__.py +15 -0
  440. paddlex/inference/models/open_vocabulary_detection/predictor.py +172 -0
  441. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +16 -0
  442. paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
  443. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +496 -0
  444. paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
  445. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +15 -0
  446. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +113 -0
  447. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +15 -0
  448. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +249 -0
  449. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +15 -0
  450. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +149 -0
  451. paddlex/inference/models/semantic_segmentation/__init__.py +15 -0
  452. paddlex/inference/models/semantic_segmentation/predictor.py +158 -0
  453. paddlex/inference/models/semantic_segmentation/processors.py +117 -0
  454. paddlex/inference/models/semantic_segmentation/result.py +73 -0
  455. paddlex/inference/models/table_structure_recognition/__init__.py +15 -0
  456. paddlex/inference/models/table_structure_recognition/predictor.py +161 -0
  457. paddlex/inference/models/table_structure_recognition/processors.py +229 -0
  458. paddlex/inference/models/table_structure_recognition/result.py +63 -0
  459. paddlex/inference/models/text_detection/__init__.py +15 -0
  460. paddlex/inference/models/text_detection/predictor.py +191 -0
  461. paddlex/inference/models/text_detection/processors.py +538 -0
  462. paddlex/inference/models/text_detection/result.py +46 -0
  463. paddlex/inference/models/text_recognition/__init__.py +15 -0
  464. paddlex/inference/models/text_recognition/predictor.py +98 -0
  465. paddlex/inference/models/text_recognition/processors.py +245 -0
  466. paddlex/inference/models/text_recognition/result.py +76 -0
  467. paddlex/inference/models/ts_anomaly_detection/__init__.py +15 -0
  468. paddlex/inference/models/ts_anomaly_detection/predictor.py +141 -0
  469. paddlex/inference/models/ts_anomaly_detection/processors.py +98 -0
  470. paddlex/inference/models/ts_anomaly_detection/result.py +83 -0
  471. paddlex/inference/models/ts_classification/__init__.py +15 -0
  472. paddlex/inference/models/ts_classification/predictor.py +122 -0
  473. paddlex/inference/models/ts_classification/processors.py +122 -0
  474. paddlex/inference/models/ts_classification/result.py +87 -0
  475. paddlex/inference/models/ts_forecasting/__init__.py +15 -0
  476. paddlex/inference/models/ts_forecasting/predictor.py +154 -0
  477. paddlex/inference/models/ts_forecasting/processors.py +158 -0
  478. paddlex/inference/models/ts_forecasting/result.py +96 -0
  479. paddlex/inference/models/video_classification/__init__.py +15 -0
  480. paddlex/inference/models/video_classification/predictor.py +141 -0
  481. paddlex/inference/models/video_classification/processors.py +409 -0
  482. paddlex/inference/models/video_classification/result.py +96 -0
  483. paddlex/inference/models/video_detection/__init__.py +15 -0
  484. paddlex/inference/models/video_detection/predictor.py +129 -0
  485. paddlex/inference/models/video_detection/processors.py +463 -0
  486. paddlex/inference/models/video_detection/result.py +109 -0
  487. paddlex/inference/pipelines/__init__.py +239 -0
  488. paddlex/inference/pipelines/_parallel.py +172 -0
  489. paddlex/inference/pipelines/anomaly_detection/__init__.py +15 -0
  490. paddlex/inference/pipelines/anomaly_detection/pipeline.py +82 -0
  491. paddlex/inference/pipelines/attribute_recognition/__init__.py +15 -0
  492. paddlex/inference/pipelines/attribute_recognition/pipeline.py +120 -0
  493. paddlex/inference/pipelines/attribute_recognition/result.py +102 -0
  494. paddlex/inference/pipelines/base.py +156 -0
  495. paddlex/inference/pipelines/components/__init__.py +29 -0
  496. paddlex/inference/pipelines/components/chat_server/__init__.py +16 -0
  497. paddlex/inference/pipelines/components/chat_server/base.py +39 -0
  498. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +236 -0
  499. paddlex/inference/pipelines/components/common/__init__.py +19 -0
  500. paddlex/inference/pipelines/components/common/base_operator.py +37 -0
  501. paddlex/inference/pipelines/components/common/base_result.py +66 -0
  502. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +45 -0
  503. paddlex/inference/pipelines/components/common/crop_image_regions.py +556 -0
  504. paddlex/inference/pipelines/components/common/seal_det_warp.py +972 -0
  505. paddlex/inference/pipelines/components/common/sort_boxes.py +85 -0
  506. paddlex/inference/pipelines/components/common/warp_image.py +50 -0
  507. paddlex/inference/pipelines/components/faisser.py +357 -0
  508. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +16 -0
  509. paddlex/inference/pipelines/components/prompt_engineering/base.py +35 -0
  510. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +128 -0
  511. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +148 -0
  512. paddlex/inference/pipelines/components/retriever/__init__.py +16 -0
  513. paddlex/inference/pipelines/components/retriever/base.py +228 -0
  514. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +70 -0
  515. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +166 -0
  516. paddlex/inference/pipelines/components/utils/__init__.py +13 -0
  517. paddlex/inference/pipelines/components/utils/mixin.py +206 -0
  518. paddlex/inference/pipelines/doc_preprocessor/__init__.py +15 -0
  519. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +209 -0
  520. paddlex/inference/pipelines/doc_preprocessor/result.py +98 -0
  521. paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
  522. paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
  523. paddlex/inference/pipelines/face_recognition/__init__.py +15 -0
  524. paddlex/inference/pipelines/face_recognition/pipeline.py +63 -0
  525. paddlex/inference/pipelines/face_recognition/result.py +44 -0
  526. paddlex/inference/pipelines/formula_recognition/__init__.py +15 -0
  527. paddlex/inference/pipelines/formula_recognition/pipeline.py +347 -0
  528. paddlex/inference/pipelines/formula_recognition/result.py +282 -0
  529. paddlex/inference/pipelines/image_classification/__init__.py +15 -0
  530. paddlex/inference/pipelines/image_classification/pipeline.py +90 -0
  531. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +15 -0
  532. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +97 -0
  533. paddlex/inference/pipelines/instance_segmentation/__init__.py +15 -0
  534. paddlex/inference/pipelines/instance_segmentation/pipeline.py +91 -0
  535. paddlex/inference/pipelines/keypoint_detection/__init__.py +15 -0
  536. paddlex/inference/pipelines/keypoint_detection/pipeline.py +158 -0
  537. paddlex/inference/pipelines/layout_parsing/__init__.py +16 -0
  538. paddlex/inference/pipelines/layout_parsing/pipeline.py +568 -0
  539. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +1382 -0
  540. paddlex/inference/pipelines/layout_parsing/result.py +191 -0
  541. paddlex/inference/pipelines/layout_parsing/result_v2.py +745 -0
  542. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  543. paddlex/inference/pipelines/layout_parsing/utils.py +951 -0
  544. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  545. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1143 -0
  546. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +562 -0
  547. paddlex/inference/pipelines/m_3d_bev_detection/__init__.py +15 -0
  548. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +74 -0
  549. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +15 -0
  550. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +78 -0
  551. paddlex/inference/pipelines/object_detection/__init__.py +15 -0
  552. paddlex/inference/pipelines/object_detection/pipeline.py +115 -0
  553. paddlex/inference/pipelines/ocr/__init__.py +15 -0
  554. paddlex/inference/pipelines/ocr/pipeline.py +463 -0
  555. paddlex/inference/pipelines/ocr/result.py +255 -0
  556. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +15 -0
  557. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +86 -0
  558. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +15 -0
  559. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +100 -0
  560. paddlex/inference/pipelines/pp_chatocr/__init__.py +16 -0
  561. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +111 -0
  562. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +781 -0
  563. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +992 -0
  564. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +15 -0
  565. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +156 -0
  566. paddlex/inference/pipelines/pp_shitu_v2/result.py +126 -0
  567. paddlex/inference/pipelines/rotated_object_detection/__init__.py +15 -0
  568. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +95 -0
  569. paddlex/inference/pipelines/seal_recognition/__init__.py +15 -0
  570. paddlex/inference/pipelines/seal_recognition/pipeline.py +335 -0
  571. paddlex/inference/pipelines/seal_recognition/result.py +89 -0
  572. paddlex/inference/pipelines/semantic_segmentation/__init__.py +15 -0
  573. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +95 -0
  574. paddlex/inference/pipelines/small_object_detection/__init__.py +15 -0
  575. paddlex/inference/pipelines/small_object_detection/pipeline.py +95 -0
  576. paddlex/inference/pipelines/table_recognition/__init__.py +16 -0
  577. paddlex/inference/pipelines/table_recognition/pipeline.py +486 -0
  578. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +1395 -0
  579. paddlex/inference/pipelines/table_recognition/result.py +218 -0
  580. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +366 -0
  581. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +488 -0
  582. paddlex/inference/pipelines/table_recognition/utils.py +44 -0
  583. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +15 -0
  584. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +72 -0
  585. paddlex/inference/pipelines/ts_classification/__init__.py +15 -0
  586. paddlex/inference/pipelines/ts_classification/pipeline.py +72 -0
  587. paddlex/inference/pipelines/ts_forecasting/__init__.py +15 -0
  588. paddlex/inference/pipelines/ts_forecasting/pipeline.py +72 -0
  589. paddlex/inference/pipelines/video_classification/__init__.py +15 -0
  590. paddlex/inference/pipelines/video_classification/pipeline.py +79 -0
  591. paddlex/inference/pipelines/video_detection/__init__.py +15 -0
  592. paddlex/inference/pipelines/video_detection/pipeline.py +86 -0
  593. paddlex/inference/serving/__init__.py +17 -0
  594. paddlex/inference/serving/basic_serving/__init__.py +18 -0
  595. paddlex/inference/serving/basic_serving/_app.py +221 -0
  596. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +44 -0
  597. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +13 -0
  598. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +104 -0
  599. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +36 -0
  600. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +95 -0
  601. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +67 -0
  602. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +100 -0
  603. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
  604. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +226 -0
  605. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +100 -0
  606. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +81 -0
  607. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +69 -0
  608. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +73 -0
  609. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +87 -0
  610. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +117 -0
  611. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +79 -0
  612. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +92 -0
  613. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +77 -0
  614. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +102 -0
  615. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +81 -0
  616. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +91 -0
  617. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +84 -0
  618. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +193 -0
  619. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +223 -0
  620. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +221 -0
  621. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +143 -0
  622. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +81 -0
  623. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +106 -0
  624. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +67 -0
  625. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +72 -0
  626. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +108 -0
  627. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +113 -0
  628. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +65 -0
  629. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +64 -0
  630. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +65 -0
  631. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +84 -0
  632. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +76 -0
  633. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +92 -0
  634. paddlex/inference/serving/basic_serving/_server.py +40 -0
  635. paddlex/inference/serving/infra/__init__.py +13 -0
  636. paddlex/inference/serving/infra/config.py +36 -0
  637. paddlex/inference/serving/infra/models.py +79 -0
  638. paddlex/inference/serving/infra/storage.py +180 -0
  639. paddlex/inference/serving/infra/utils.py +285 -0
  640. paddlex/inference/serving/schemas/__init__.py +13 -0
  641. paddlex/inference/serving/schemas/anomaly_detection.py +39 -0
  642. paddlex/inference/serving/schemas/doc_preprocessor.py +54 -0
  643. paddlex/inference/serving/schemas/doc_understanding.py +78 -0
  644. paddlex/inference/serving/schemas/face_recognition.py +124 -0
  645. paddlex/inference/serving/schemas/formula_recognition.py +56 -0
  646. paddlex/inference/serving/schemas/human_keypoint_detection.py +55 -0
  647. paddlex/inference/serving/schemas/image_classification.py +45 -0
  648. paddlex/inference/serving/schemas/image_multilabel_classification.py +47 -0
  649. paddlex/inference/serving/schemas/instance_segmentation.py +53 -0
  650. paddlex/inference/serving/schemas/layout_parsing.py +71 -0
  651. paddlex/inference/serving/schemas/m_3d_bev_detection.py +48 -0
  652. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +57 -0
  653. paddlex/inference/serving/schemas/object_detection.py +52 -0
  654. paddlex/inference/serving/schemas/ocr.py +60 -0
  655. paddlex/inference/serving/schemas/open_vocabulary_detection.py +52 -0
  656. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +52 -0
  657. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +61 -0
  658. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +133 -0
  659. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +150 -0
  660. paddlex/inference/serving/schemas/pp_shituv2.py +124 -0
  661. paddlex/inference/serving/schemas/pp_structurev3.py +88 -0
  662. paddlex/inference/serving/schemas/rotated_object_detection.py +52 -0
  663. paddlex/inference/serving/schemas/seal_recognition.py +62 -0
  664. paddlex/inference/serving/schemas/semantic_segmentation.py +45 -0
  665. paddlex/inference/serving/schemas/shared/__init__.py +13 -0
  666. paddlex/inference/serving/schemas/shared/classification.py +23 -0
  667. paddlex/inference/serving/schemas/shared/image_segmentation.py +28 -0
  668. paddlex/inference/serving/schemas/shared/object_detection.py +24 -0
  669. paddlex/inference/serving/schemas/shared/ocr.py +25 -0
  670. paddlex/inference/serving/schemas/small_object_detection.py +52 -0
  671. paddlex/inference/serving/schemas/table_recognition.py +64 -0
  672. paddlex/inference/serving/schemas/table_recognition_v2.py +69 -0
  673. paddlex/inference/serving/schemas/ts_anomaly_detection.py +37 -0
  674. paddlex/inference/serving/schemas/ts_classification.py +38 -0
  675. paddlex/inference/serving/schemas/ts_forecast.py +37 -0
  676. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +61 -0
  677. paddlex/inference/serving/schemas/video_classification.py +44 -0
  678. paddlex/inference/serving/schemas/video_detection.py +56 -0
  679. paddlex/inference/utils/__init__.py +13 -0
  680. paddlex/inference/utils/benchmark.py +379 -0
  681. paddlex/inference/utils/color_map.py +123 -0
  682. paddlex/inference/utils/get_pipeline_path.py +27 -0
  683. paddlex/inference/utils/hpi.py +254 -0
  684. paddlex/inference/utils/hpi_model_info_collection.json +2331 -0
  685. paddlex/inference/utils/io/__init__.py +36 -0
  686. paddlex/inference/utils/io/readers.py +504 -0
  687. paddlex/inference/utils/io/style.py +381 -0
  688. paddlex/inference/utils/io/tablepyxl.py +157 -0
  689. paddlex/inference/utils/io/writers.py +458 -0
  690. paddlex/inference/utils/model_paths.py +48 -0
  691. paddlex/inference/utils/new_ir_blocklist.py +27 -0
  692. paddlex/inference/utils/official_models.py +367 -0
  693. paddlex/inference/utils/pp_option.py +339 -0
  694. paddlex/inference/utils/trt_blocklist.py +43 -0
  695. paddlex/inference/utils/trt_config.py +420 -0
  696. paddlex/model.py +131 -0
  697. paddlex/modules/__init__.py +115 -0
  698. paddlex/modules/anomaly_detection/__init__.py +18 -0
  699. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +94 -0
  700. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  701. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  702. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  703. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +233 -0
  704. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +87 -0
  705. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  706. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +76 -0
  707. paddlex/modules/anomaly_detection/evaluator.py +58 -0
  708. paddlex/modules/anomaly_detection/exportor.py +22 -0
  709. paddlex/modules/anomaly_detection/model_list.py +16 -0
  710. paddlex/modules/anomaly_detection/trainer.py +70 -0
  711. paddlex/modules/base/__init__.py +18 -0
  712. paddlex/modules/base/build_model.py +33 -0
  713. paddlex/modules/base/dataset_checker/__init__.py +16 -0
  714. paddlex/modules/base/dataset_checker/dataset_checker.py +169 -0
  715. paddlex/modules/base/dataset_checker/utils.py +108 -0
  716. paddlex/modules/base/evaluator.py +170 -0
  717. paddlex/modules/base/exportor.py +145 -0
  718. paddlex/modules/base/trainer.py +144 -0
  719. paddlex/modules/base/utils/__init__.py +13 -0
  720. paddlex/modules/base/utils/cinn_setting.py +89 -0
  721. paddlex/modules/base/utils/coco_eval.py +94 -0
  722. paddlex/modules/base/utils/topk_eval.py +118 -0
  723. paddlex/modules/doc_vlm/__init__.py +18 -0
  724. paddlex/modules/doc_vlm/dataset_checker.py +29 -0
  725. paddlex/modules/doc_vlm/evaluator.py +29 -0
  726. paddlex/modules/doc_vlm/exportor.py +29 -0
  727. paddlex/modules/doc_vlm/model_list.py +16 -0
  728. paddlex/modules/doc_vlm/trainer.py +41 -0
  729. paddlex/modules/face_recognition/__init__.py +18 -0
  730. paddlex/modules/face_recognition/dataset_checker/__init__.py +71 -0
  731. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +16 -0
  732. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +172 -0
  733. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  734. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +153 -0
  735. paddlex/modules/face_recognition/evaluator.py +52 -0
  736. paddlex/modules/face_recognition/exportor.py +22 -0
  737. paddlex/modules/face_recognition/model_list.py +15 -0
  738. paddlex/modules/face_recognition/trainer.py +75 -0
  739. paddlex/modules/formula_recognition/__init__.py +18 -0
  740. paddlex/modules/formula_recognition/dataset_checker/__init__.py +113 -0
  741. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  742. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +158 -0
  743. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +76 -0
  744. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +95 -0
  745. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  746. paddlex/modules/formula_recognition/evaluator.py +80 -0
  747. paddlex/modules/formula_recognition/exportor.py +22 -0
  748. paddlex/modules/formula_recognition/model_list.py +23 -0
  749. paddlex/modules/formula_recognition/trainer.py +123 -0
  750. paddlex/modules/general_recognition/__init__.py +18 -0
  751. paddlex/modules/general_recognition/dataset_checker/__init__.py +107 -0
  752. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  753. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +96 -0
  754. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +99 -0
  755. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +100 -0
  756. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +82 -0
  757. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  758. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +147 -0
  759. paddlex/modules/general_recognition/evaluator.py +31 -0
  760. paddlex/modules/general_recognition/exportor.py +22 -0
  761. paddlex/modules/general_recognition/model_list.py +19 -0
  762. paddlex/modules/general_recognition/trainer.py +52 -0
  763. paddlex/modules/image_classification/__init__.py +18 -0
  764. paddlex/modules/image_classification/dataset_checker/__init__.py +104 -0
  765. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +19 -0
  766. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +92 -0
  767. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +132 -0
  768. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +51 -0
  769. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  770. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  771. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +153 -0
  772. paddlex/modules/image_classification/evaluator.py +43 -0
  773. paddlex/modules/image_classification/exportor.py +22 -0
  774. paddlex/modules/image_classification/model_list.py +99 -0
  775. paddlex/modules/image_classification/trainer.py +82 -0
  776. paddlex/modules/image_unwarping/__init__.py +13 -0
  777. paddlex/modules/image_unwarping/model_list.py +17 -0
  778. paddlex/modules/instance_segmentation/__init__.py +18 -0
  779. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +107 -0
  780. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  781. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  782. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +95 -0
  783. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +241 -0
  784. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +122 -0
  785. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  786. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +223 -0
  787. paddlex/modules/instance_segmentation/evaluator.py +32 -0
  788. paddlex/modules/instance_segmentation/exportor.py +22 -0
  789. paddlex/modules/instance_segmentation/model_list.py +33 -0
  790. paddlex/modules/instance_segmentation/trainer.py +31 -0
  791. paddlex/modules/keypoint_detection/__init__.py +18 -0
  792. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +56 -0
  793. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +15 -0
  794. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  795. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  796. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +124 -0
  797. paddlex/modules/keypoint_detection/evaluator.py +41 -0
  798. paddlex/modules/keypoint_detection/exportor.py +22 -0
  799. paddlex/modules/keypoint_detection/model_list.py +16 -0
  800. paddlex/modules/keypoint_detection/trainer.py +39 -0
  801. paddlex/modules/m_3d_bev_detection/__init__.py +18 -0
  802. paddlex/modules/m_3d_bev_detection/dataset_checker/__init__.py +95 -0
  803. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/__init__.py +17 -0
  804. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/analyse_dataset.py +106 -0
  805. paddlex/modules/m_3d_bev_detection/dataset_checker/dataset_src/check_dataset.py +101 -0
  806. paddlex/modules/m_3d_bev_detection/evaluator.py +46 -0
  807. paddlex/modules/m_3d_bev_detection/exportor.py +22 -0
  808. paddlex/modules/m_3d_bev_detection/model_list.py +18 -0
  809. paddlex/modules/m_3d_bev_detection/trainer.py +68 -0
  810. paddlex/modules/multilabel_classification/__init__.py +18 -0
  811. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +106 -0
  812. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +19 -0
  813. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +94 -0
  814. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +132 -0
  815. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +120 -0
  816. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  817. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  818. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +149 -0
  819. paddlex/modules/multilabel_classification/evaluator.py +43 -0
  820. paddlex/modules/multilabel_classification/exportor.py +22 -0
  821. paddlex/modules/multilabel_classification/model_list.py +24 -0
  822. paddlex/modules/multilabel_classification/trainer.py +85 -0
  823. paddlex/modules/multilingual_speech_recognition/__init__.py +18 -0
  824. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +27 -0
  825. paddlex/modules/multilingual_speech_recognition/evaluator.py +27 -0
  826. paddlex/modules/multilingual_speech_recognition/exportor.py +27 -0
  827. paddlex/modules/multilingual_speech_recognition/model_list.py +22 -0
  828. paddlex/modules/multilingual_speech_recognition/trainer.py +42 -0
  829. paddlex/modules/object_detection/__init__.py +18 -0
  830. paddlex/modules/object_detection/dataset_checker/__init__.py +106 -0
  831. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +19 -0
  832. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +82 -0
  833. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +91 -0
  834. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +438 -0
  835. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +123 -0
  836. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  837. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +193 -0
  838. paddlex/modules/object_detection/evaluator.py +57 -0
  839. paddlex/modules/object_detection/exportor.py +22 -0
  840. paddlex/modules/object_detection/model_list.py +86 -0
  841. paddlex/modules/object_detection/trainer.py +98 -0
  842. paddlex/modules/open_vocabulary_detection/__init__.py +18 -0
  843. paddlex/modules/open_vocabulary_detection/dataset_checker.py +29 -0
  844. paddlex/modules/open_vocabulary_detection/evaluator.py +29 -0
  845. paddlex/modules/open_vocabulary_detection/exportor.py +29 -0
  846. paddlex/modules/open_vocabulary_detection/model_list.py +16 -0
  847. paddlex/modules/open_vocabulary_detection/trainer.py +44 -0
  848. paddlex/modules/open_vocabulary_segmentation/__init__.py +18 -0
  849. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +29 -0
  850. paddlex/modules/open_vocabulary_segmentation/evaluator.py +29 -0
  851. paddlex/modules/open_vocabulary_segmentation/exportor.py +29 -0
  852. paddlex/modules/open_vocabulary_segmentation/model_list.py +19 -0
  853. paddlex/modules/open_vocabulary_segmentation/trainer.py +44 -0
  854. paddlex/modules/semantic_segmentation/__init__.py +18 -0
  855. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +109 -0
  856. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  857. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +76 -0
  858. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +80 -0
  859. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +165 -0
  860. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +87 -0
  861. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  862. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +75 -0
  863. paddlex/modules/semantic_segmentation/evaluator.py +58 -0
  864. paddlex/modules/semantic_segmentation/exportor.py +31 -0
  865. paddlex/modules/semantic_segmentation/model_list.py +37 -0
  866. paddlex/modules/semantic_segmentation/trainer.py +72 -0
  867. paddlex/modules/table_recognition/__init__.py +18 -0
  868. paddlex/modules/table_recognition/dataset_checker/__init__.py +98 -0
  869. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +18 -0
  870. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +59 -0
  871. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +87 -0
  872. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  873. paddlex/modules/table_recognition/evaluator.py +43 -0
  874. paddlex/modules/table_recognition/exportor.py +22 -0
  875. paddlex/modules/table_recognition/model_list.py +21 -0
  876. paddlex/modules/table_recognition/trainer.py +67 -0
  877. paddlex/modules/text_detection/__init__.py +18 -0
  878. paddlex/modules/text_detection/dataset_checker/__init__.py +107 -0
  879. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +18 -0
  880. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +220 -0
  881. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +106 -0
  882. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +140 -0
  883. paddlex/modules/text_detection/evaluator.py +41 -0
  884. paddlex/modules/text_detection/exportor.py +22 -0
  885. paddlex/modules/text_detection/model_list.py +26 -0
  886. paddlex/modules/text_detection/trainer.py +65 -0
  887. paddlex/modules/text_recognition/__init__.py +18 -0
  888. paddlex/modules/text_recognition/dataset_checker/__init__.py +125 -0
  889. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  890. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +162 -0
  891. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +104 -0
  892. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +95 -0
  893. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +80 -0
  894. paddlex/modules/text_recognition/evaluator.py +64 -0
  895. paddlex/modules/text_recognition/exportor.py +22 -0
  896. paddlex/modules/text_recognition/model_list.py +36 -0
  897. paddlex/modules/text_recognition/trainer.py +105 -0
  898. paddlex/modules/ts_anomaly_detection/__init__.py +19 -0
  899. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +111 -0
  900. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  901. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +19 -0
  902. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +64 -0
  903. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +74 -0
  904. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +63 -0
  905. paddlex/modules/ts_anomaly_detection/evaluator.py +67 -0
  906. paddlex/modules/ts_anomaly_detection/exportor.py +44 -0
  907. paddlex/modules/ts_anomaly_detection/model_list.py +22 -0
  908. paddlex/modules/ts_anomaly_detection/trainer.py +113 -0
  909. paddlex/modules/ts_classification/__init__.py +19 -0
  910. paddlex/modules/ts_classification/dataset_checker/__init__.py +111 -0
  911. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +19 -0
  912. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +77 -0
  913. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +64 -0
  914. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +74 -0
  915. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +88 -0
  916. paddlex/modules/ts_classification/evaluator.py +66 -0
  917. paddlex/modules/ts_classification/exportor.py +44 -0
  918. paddlex/modules/ts_classification/model_list.py +18 -0
  919. paddlex/modules/ts_classification/trainer.py +108 -0
  920. paddlex/modules/ts_forecast/__init__.py +19 -0
  921. paddlex/modules/ts_forecast/dataset_checker/__init__.py +111 -0
  922. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +19 -0
  923. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +19 -0
  924. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +64 -0
  925. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +73 -0
  926. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +63 -0
  927. paddlex/modules/ts_forecast/evaluator.py +66 -0
  928. paddlex/modules/ts_forecast/exportor.py +44 -0
  929. paddlex/modules/ts_forecast/model_list.py +24 -0
  930. paddlex/modules/ts_forecast/trainer.py +108 -0
  931. paddlex/modules/video_classification/__init__.py +18 -0
  932. paddlex/modules/video_classification/dataset_checker/__init__.py +93 -0
  933. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +18 -0
  934. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +93 -0
  935. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +120 -0
  936. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +82 -0
  937. paddlex/modules/video_classification/evaluator.py +44 -0
  938. paddlex/modules/video_classification/exportor.py +22 -0
  939. paddlex/modules/video_classification/model_list.py +19 -0
  940. paddlex/modules/video_classification/trainer.py +88 -0
  941. paddlex/modules/video_detection/__init__.py +18 -0
  942. paddlex/modules/video_detection/dataset_checker/__init__.py +86 -0
  943. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +17 -0
  944. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +100 -0
  945. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +132 -0
  946. paddlex/modules/video_detection/evaluator.py +42 -0
  947. paddlex/modules/video_detection/exportor.py +22 -0
  948. paddlex/modules/video_detection/model_list.py +15 -0
  949. paddlex/modules/video_detection/trainer.py +82 -0
  950. paddlex/ops/__init__.py +152 -0
  951. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +266 -0
  952. paddlex/ops/iou3d_nms/iou3d_cpu.h +28 -0
  953. paddlex/ops/iou3d_nms/iou3d_nms.cpp +206 -0
  954. paddlex/ops/iou3d_nms/iou3d_nms.h +35 -0
  955. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +114 -0
  956. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +484 -0
  957. paddlex/ops/setup.py +37 -0
  958. paddlex/ops/voxel/voxelize_op.cc +194 -0
  959. paddlex/ops/voxel/voxelize_op.cu +346 -0
  960. paddlex/paddlex_cli.py +476 -0
  961. paddlex/repo_apis/Paddle3D_api/__init__.py +17 -0
  962. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +18 -0
  963. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +118 -0
  964. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +238 -0
  965. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +55 -0
  966. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +104 -0
  967. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +145 -0
  968. paddlex/repo_apis/PaddleClas_api/__init__.py +17 -0
  969. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +19 -0
  970. paddlex/repo_apis/PaddleClas_api/cls/config.py +595 -0
  971. paddlex/repo_apis/PaddleClas_api/cls/model.py +355 -0
  972. paddlex/repo_apis/PaddleClas_api/cls/register.py +907 -0
  973. paddlex/repo_apis/PaddleClas_api/cls/runner.py +218 -0
  974. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +18 -0
  975. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +141 -0
  976. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +20 -0
  977. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +68 -0
  978. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +50 -0
  979. paddlex/repo_apis/PaddleDetection_api/__init__.py +17 -0
  980. paddlex/repo_apis/PaddleDetection_api/config_helper.py +280 -0
  981. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +18 -0
  982. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +457 -0
  983. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +403 -0
  984. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +262 -0
  985. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +225 -0
  986. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +19 -0
  987. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +540 -0
  988. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +429 -0
  989. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +245 -0
  990. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +1135 -0
  991. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +225 -0
  992. paddlex/repo_apis/PaddleNLP_api/__init__.py +13 -0
  993. paddlex/repo_apis/PaddleOCR_api/__init__.py +22 -0
  994. paddlex/repo_apis/PaddleOCR_api/config_utils.py +53 -0
  995. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +16 -0
  996. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +571 -0
  997. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +398 -0
  998. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +99 -0
  999. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +239 -0
  1000. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +16 -0
  1001. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +64 -0
  1002. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +126 -0
  1003. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +70 -0
  1004. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +51 -0
  1005. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +16 -0
  1006. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +62 -0
  1007. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +72 -0
  1008. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +107 -0
  1009. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +53 -0
  1010. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +16 -0
  1011. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +564 -0
  1012. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +398 -0
  1013. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +216 -0
  1014. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +239 -0
  1015. paddlex/repo_apis/PaddleSeg_api/__init__.py +16 -0
  1016. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +134 -0
  1017. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +16 -0
  1018. paddlex/repo_apis/PaddleSeg_api/seg/config.py +183 -0
  1019. paddlex/repo_apis/PaddleSeg_api/seg/model.py +491 -0
  1020. paddlex/repo_apis/PaddleSeg_api/seg/register.py +272 -0
  1021. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +261 -0
  1022. paddlex/repo_apis/PaddleTS_api/__init__.py +20 -0
  1023. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +16 -0
  1024. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +88 -0
  1025. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +146 -0
  1026. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +158 -0
  1027. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +13 -0
  1028. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +244 -0
  1029. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +276 -0
  1030. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +158 -0
  1031. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +16 -0
  1032. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +72 -0
  1033. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +59 -0
  1034. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +158 -0
  1035. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +16 -0
  1036. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +136 -0
  1037. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +186 -0
  1038. paddlex/repo_apis/PaddleVideo_api/__init__.py +17 -0
  1039. paddlex/repo_apis/PaddleVideo_api/config_utils.py +51 -0
  1040. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +19 -0
  1041. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +548 -0
  1042. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +346 -0
  1043. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +70 -0
  1044. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +204 -0
  1045. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +19 -0
  1046. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +549 -0
  1047. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +298 -0
  1048. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +44 -0
  1049. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +199 -0
  1050. paddlex/repo_apis/__init__.py +13 -0
  1051. paddlex/repo_apis/base/__init__.py +22 -0
  1052. paddlex/repo_apis/base/config.py +237 -0
  1053. paddlex/repo_apis/base/model.py +563 -0
  1054. paddlex/repo_apis/base/register.py +135 -0
  1055. paddlex/repo_apis/base/runner.py +390 -0
  1056. paddlex/repo_apis/base/utils/__init__.py +13 -0
  1057. paddlex/repo_apis/base/utils/arg.py +64 -0
  1058. paddlex/repo_apis/base/utils/subprocess.py +107 -0
  1059. paddlex/repo_manager/__init__.py +17 -0
  1060. paddlex/repo_manager/core.py +253 -0
  1061. paddlex/repo_manager/meta.py +180 -0
  1062. paddlex/repo_manager/repo.py +425 -0
  1063. paddlex/repo_manager/utils.py +148 -0
  1064. paddlex/utils/__init__.py +1 -12
  1065. paddlex/utils/cache.py +146 -0
  1066. paddlex/utils/config.py +216 -0
  1067. paddlex/utils/custom_device_list.py +311 -0
  1068. paddlex/utils/deps.py +249 -0
  1069. paddlex/utils/device.py +195 -0
  1070. paddlex/utils/download.py +168 -182
  1071. paddlex/utils/env.py +31 -48
  1072. paddlex/utils/errors/__init__.py +17 -0
  1073. paddlex/utils/errors/dataset_checker.py +78 -0
  1074. paddlex/utils/errors/others.py +138 -0
  1075. paddlex/utils/file_interface.py +211 -0
  1076. paddlex/utils/flags.py +70 -0
  1077. paddlex/utils/fonts/__init__.py +97 -0
  1078. paddlex/utils/func_register.py +41 -0
  1079. paddlex/utils/install.py +87 -0
  1080. paddlex/utils/interactive_get_pipeline.py +55 -0
  1081. paddlex/utils/lazy_loader.py +68 -0
  1082. paddlex/utils/logging.py +140 -33
  1083. paddlex/utils/misc.py +201 -0
  1084. paddlex/utils/pipeline_arguments.py +719 -0
  1085. paddlex/utils/result_saver.py +58 -0
  1086. paddlex/utils/subclass_register.py +99 -0
  1087. paddlex/version.py +55 -0
  1088. paddlex-3.0.0.dist-info/METADATA +1168 -0
  1089. paddlex-3.0.0.dist-info/RECORD +1093 -0
  1090. paddlex-3.0.0.dist-info/WHEEL +5 -0
  1091. paddlex-3.0.0.dist-info/entry_points.txt +2 -0
  1092. paddlex-3.0.0.dist-info/licenses/LICENSE +169 -0
  1093. paddlex-3.0.0.dist-info/top_level.txt +1 -0
  1094. PaddleClas/__init__.py +0 -16
  1095. PaddleClas/paddleclas.py +0 -375
  1096. PaddleClas/ppcls/__init__.py +0 -20
  1097. PaddleClas/ppcls/data/__init__.py +0 -15
  1098. PaddleClas/ppcls/data/imaug/__init__.py +0 -94
  1099. PaddleClas/ppcls/data/imaug/autoaugment.py +0 -264
  1100. PaddleClas/ppcls/data/imaug/batch_operators.py +0 -117
  1101. PaddleClas/ppcls/data/imaug/cutout.py +0 -41
  1102. PaddleClas/ppcls/data/imaug/fmix.py +0 -217
  1103. PaddleClas/ppcls/data/imaug/grid.py +0 -89
  1104. PaddleClas/ppcls/data/imaug/hide_and_seek.py +0 -44
  1105. PaddleClas/ppcls/data/imaug/operators.py +0 -244
  1106. PaddleClas/ppcls/data/imaug/randaugment.py +0 -106
  1107. PaddleClas/ppcls/data/imaug/random_erasing.py +0 -55
  1108. PaddleClas/ppcls/data/reader.py +0 -318
  1109. PaddleClas/ppcls/modeling/__init__.py +0 -20
  1110. PaddleClas/ppcls/modeling/architectures/__init__.py +0 -51
  1111. PaddleClas/ppcls/modeling/architectures/alexnet.py +0 -132
  1112. PaddleClas/ppcls/modeling/architectures/darknet.py +0 -161
  1113. PaddleClas/ppcls/modeling/architectures/densenet.py +0 -308
  1114. PaddleClas/ppcls/modeling/architectures/distillation_models.py +0 -65
  1115. PaddleClas/ppcls/modeling/architectures/distilled_vision_transformer.py +0 -196
  1116. PaddleClas/ppcls/modeling/architectures/dpn.py +0 -425
  1117. PaddleClas/ppcls/modeling/architectures/efficientnet.py +0 -901
  1118. PaddleClas/ppcls/modeling/architectures/ghostnet.py +0 -331
  1119. PaddleClas/ppcls/modeling/architectures/googlenet.py +0 -207
  1120. PaddleClas/ppcls/modeling/architectures/hrnet.py +0 -742
  1121. PaddleClas/ppcls/modeling/architectures/inception_v3.py +0 -481
  1122. PaddleClas/ppcls/modeling/architectures/inception_v4.py +0 -455
  1123. PaddleClas/ppcls/modeling/architectures/mixnet.py +0 -782
  1124. PaddleClas/ppcls/modeling/architectures/mobilenet_v1.py +0 -266
  1125. PaddleClas/ppcls/modeling/architectures/mobilenet_v2.py +0 -248
  1126. PaddleClas/ppcls/modeling/architectures/mobilenet_v3.py +0 -359
  1127. PaddleClas/ppcls/modeling/architectures/regnet.py +0 -383
  1128. PaddleClas/ppcls/modeling/architectures/repvgg.py +0 -339
  1129. PaddleClas/ppcls/modeling/architectures/res2net.py +0 -272
  1130. PaddleClas/ppcls/modeling/architectures/res2net_vd.py +0 -295
  1131. PaddleClas/ppcls/modeling/architectures/resnest.py +0 -705
  1132. PaddleClas/ppcls/modeling/architectures/resnet.py +0 -316
  1133. PaddleClas/ppcls/modeling/architectures/resnet_vc.py +0 -309
  1134. PaddleClas/ppcls/modeling/architectures/resnet_vd.py +0 -354
  1135. PaddleClas/ppcls/modeling/architectures/resnext.py +0 -253
  1136. PaddleClas/ppcls/modeling/architectures/resnext101_wsl.py +0 -447
  1137. PaddleClas/ppcls/modeling/architectures/resnext_vd.py +0 -266
  1138. PaddleClas/ppcls/modeling/architectures/rexnet.py +0 -240
  1139. PaddleClas/ppcls/modeling/architectures/se_resnet_vd.py +0 -378
  1140. PaddleClas/ppcls/modeling/architectures/se_resnext.py +0 -290
  1141. PaddleClas/ppcls/modeling/architectures/se_resnext_vd.py +0 -285
  1142. PaddleClas/ppcls/modeling/architectures/shufflenet_v2.py +0 -320
  1143. PaddleClas/ppcls/modeling/architectures/squeezenet.py +0 -154
  1144. PaddleClas/ppcls/modeling/architectures/vgg.py +0 -152
  1145. PaddleClas/ppcls/modeling/architectures/vision_transformer.py +0 -402
  1146. PaddleClas/ppcls/modeling/architectures/xception.py +0 -345
  1147. PaddleClas/ppcls/modeling/architectures/xception_deeplab.py +0 -386
  1148. PaddleClas/ppcls/modeling/loss.py +0 -154
  1149. PaddleClas/ppcls/modeling/utils.py +0 -53
  1150. PaddleClas/ppcls/optimizer/__init__.py +0 -19
  1151. PaddleClas/ppcls/optimizer/learning_rate.py +0 -159
  1152. PaddleClas/ppcls/optimizer/optimizer.py +0 -165
  1153. PaddleClas/ppcls/utils/__init__.py +0 -27
  1154. PaddleClas/ppcls/utils/check.py +0 -151
  1155. PaddleClas/ppcls/utils/config.py +0 -201
  1156. PaddleClas/ppcls/utils/logger.py +0 -120
  1157. PaddleClas/ppcls/utils/metrics.py +0 -107
  1158. PaddleClas/ppcls/utils/misc.py +0 -62
  1159. PaddleClas/ppcls/utils/model_zoo.py +0 -213
  1160. PaddleClas/ppcls/utils/save_load.py +0 -163
  1161. PaddleClas/setup.py +0 -55
  1162. PaddleClas/tools/__init__.py +0 -15
  1163. PaddleClas/tools/download.py +0 -50
  1164. PaddleClas/tools/ema.py +0 -58
  1165. PaddleClas/tools/eval.py +0 -112
  1166. PaddleClas/tools/export_model.py +0 -85
  1167. PaddleClas/tools/export_serving_model.py +0 -76
  1168. PaddleClas/tools/infer/__init__.py +0 -16
  1169. PaddleClas/tools/infer/infer.py +0 -94
  1170. PaddleClas/tools/infer/predict.py +0 -117
  1171. PaddleClas/tools/infer/utils.py +0 -233
  1172. PaddleClas/tools/program.py +0 -444
  1173. PaddleClas/tools/test_hubserving.py +0 -113
  1174. PaddleClas/tools/train.py +0 -141
  1175. paddlex/cls.py +0 -76
  1176. paddlex/command.py +0 -215
  1177. paddlex/cv/__init__.py +0 -17
  1178. paddlex/cv/datasets/__init__.py +0 -18
  1179. paddlex/cv/datasets/coco.py +0 -169
  1180. paddlex/cv/datasets/imagenet.py +0 -88
  1181. paddlex/cv/datasets/seg_dataset.py +0 -91
  1182. paddlex/cv/datasets/voc.py +0 -301
  1183. paddlex/cv/models/__init__.py +0 -18
  1184. paddlex/cv/models/base.py +0 -623
  1185. paddlex/cv/models/classifier.py +0 -814
  1186. paddlex/cv/models/detector.py +0 -1747
  1187. paddlex/cv/models/load_model.py +0 -126
  1188. paddlex/cv/models/segmenter.py +0 -673
  1189. paddlex/cv/models/slim/__init__.py +0 -13
  1190. paddlex/cv/models/slim/prune.py +0 -55
  1191. paddlex/cv/models/utils/__init__.py +0 -13
  1192. paddlex/cv/models/utils/det_metrics/__init__.py +0 -15
  1193. paddlex/cv/models/utils/det_metrics/coco_utils.py +0 -217
  1194. paddlex/cv/models/utils/det_metrics/metrics.py +0 -220
  1195. paddlex/cv/models/utils/ema.py +0 -48
  1196. paddlex/cv/models/utils/seg_metrics.py +0 -62
  1197. paddlex/cv/models/utils/visualize.py +0 -394
  1198. paddlex/cv/transforms/__init__.py +0 -46
  1199. paddlex/cv/transforms/batch_operators.py +0 -286
  1200. paddlex/cv/transforms/box_utils.py +0 -41
  1201. paddlex/cv/transforms/functions.py +0 -193
  1202. paddlex/cv/transforms/operators.py +0 -1402
  1203. paddlex/det.py +0 -43
  1204. paddlex/paddleseg/__init__.py +0 -17
  1205. paddlex/paddleseg/core/__init__.py +0 -20
  1206. paddlex/paddleseg/core/infer.py +0 -289
  1207. paddlex/paddleseg/core/predict.py +0 -145
  1208. paddlex/paddleseg/core/train.py +0 -258
  1209. paddlex/paddleseg/core/val.py +0 -172
  1210. paddlex/paddleseg/cvlibs/__init__.py +0 -17
  1211. paddlex/paddleseg/cvlibs/callbacks.py +0 -279
  1212. paddlex/paddleseg/cvlibs/config.py +0 -359
  1213. paddlex/paddleseg/cvlibs/manager.py +0 -142
  1214. paddlex/paddleseg/cvlibs/param_init.py +0 -91
  1215. paddlex/paddleseg/datasets/__init__.py +0 -21
  1216. paddlex/paddleseg/datasets/ade.py +0 -112
  1217. paddlex/paddleseg/datasets/cityscapes.py +0 -86
  1218. paddlex/paddleseg/datasets/cocostuff.py +0 -79
  1219. paddlex/paddleseg/datasets/dataset.py +0 -164
  1220. paddlex/paddleseg/datasets/mini_deep_globe_road_extraction.py +0 -95
  1221. paddlex/paddleseg/datasets/optic_disc_seg.py +0 -97
  1222. paddlex/paddleseg/datasets/pascal_context.py +0 -80
  1223. paddlex/paddleseg/datasets/voc.py +0 -113
  1224. paddlex/paddleseg/models/__init__.py +0 -39
  1225. paddlex/paddleseg/models/ann.py +0 -436
  1226. paddlex/paddleseg/models/attention_unet.py +0 -189
  1227. paddlex/paddleseg/models/backbones/__init__.py +0 -18
  1228. paddlex/paddleseg/models/backbones/hrnet.py +0 -815
  1229. paddlex/paddleseg/models/backbones/mobilenetv3.py +0 -365
  1230. paddlex/paddleseg/models/backbones/resnet_vd.py +0 -364
  1231. paddlex/paddleseg/models/backbones/xception_deeplab.py +0 -415
  1232. paddlex/paddleseg/models/bisenet.py +0 -311
  1233. paddlex/paddleseg/models/danet.py +0 -220
  1234. paddlex/paddleseg/models/decoupled_segnet.py +0 -233
  1235. paddlex/paddleseg/models/deeplab.py +0 -258
  1236. paddlex/paddleseg/models/dnlnet.py +0 -231
  1237. paddlex/paddleseg/models/emanet.py +0 -219
  1238. paddlex/paddleseg/models/fast_scnn.py +0 -318
  1239. paddlex/paddleseg/models/fcn.py +0 -135
  1240. paddlex/paddleseg/models/gcnet.py +0 -223
  1241. paddlex/paddleseg/models/gscnn.py +0 -357
  1242. paddlex/paddleseg/models/hardnet.py +0 -309
  1243. paddlex/paddleseg/models/isanet.py +0 -202
  1244. paddlex/paddleseg/models/layers/__init__.py +0 -19
  1245. paddlex/paddleseg/models/layers/activation.py +0 -73
  1246. paddlex/paddleseg/models/layers/attention.py +0 -146
  1247. paddlex/paddleseg/models/layers/layer_libs.py +0 -168
  1248. paddlex/paddleseg/models/layers/nonlocal2d.py +0 -155
  1249. paddlex/paddleseg/models/layers/pyramid_pool.py +0 -182
  1250. paddlex/paddleseg/models/losses/__init__.py +0 -27
  1251. paddlex/paddleseg/models/losses/binary_cross_entropy_loss.py +0 -174
  1252. paddlex/paddleseg/models/losses/bootstrapped_cross_entropy.py +0 -73
  1253. paddlex/paddleseg/models/losses/cross_entropy_loss.py +0 -94
  1254. paddlex/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py +0 -129
  1255. paddlex/paddleseg/models/losses/dice_loss.py +0 -61
  1256. paddlex/paddleseg/models/losses/edge_attention_loss.py +0 -78
  1257. paddlex/paddleseg/models/losses/gscnn_dual_task_loss.py +0 -141
  1258. paddlex/paddleseg/models/losses/l1_loss.py +0 -76
  1259. paddlex/paddleseg/models/losses/lovasz_loss.py +0 -222
  1260. paddlex/paddleseg/models/losses/mean_square_error_loss.py +0 -65
  1261. paddlex/paddleseg/models/losses/mixed_loss.py +0 -58
  1262. paddlex/paddleseg/models/losses/ohem_cross_entropy_loss.py +0 -99
  1263. paddlex/paddleseg/models/losses/ohem_edge_attention_loss.py +0 -114
  1264. paddlex/paddleseg/models/ocrnet.py +0 -248
  1265. paddlex/paddleseg/models/pspnet.py +0 -147
  1266. paddlex/paddleseg/models/sfnet.py +0 -236
  1267. paddlex/paddleseg/models/shufflenet_slim.py +0 -268
  1268. paddlex/paddleseg/models/u2net.py +0 -574
  1269. paddlex/paddleseg/models/unet.py +0 -155
  1270. paddlex/paddleseg/models/unet_3plus.py +0 -316
  1271. paddlex/paddleseg/models/unet_plusplus.py +0 -237
  1272. paddlex/paddleseg/transforms/__init__.py +0 -16
  1273. paddlex/paddleseg/transforms/functional.py +0 -161
  1274. paddlex/paddleseg/transforms/transforms.py +0 -937
  1275. paddlex/paddleseg/utils/__init__.py +0 -22
  1276. paddlex/paddleseg/utils/config_check.py +0 -60
  1277. paddlex/paddleseg/utils/download.py +0 -163
  1278. paddlex/paddleseg/utils/env/__init__.py +0 -16
  1279. paddlex/paddleseg/utils/env/seg_env.py +0 -56
  1280. paddlex/paddleseg/utils/env/sys_env.py +0 -122
  1281. paddlex/paddleseg/utils/logger.py +0 -48
  1282. paddlex/paddleseg/utils/metrics.py +0 -146
  1283. paddlex/paddleseg/utils/progbar.py +0 -212
  1284. paddlex/paddleseg/utils/timer.py +0 -53
  1285. paddlex/paddleseg/utils/utils.py +0 -120
  1286. paddlex/paddleseg/utils/visualize.py +0 -90
  1287. paddlex/ppcls/__init__.py +0 -20
  1288. paddlex/ppcls/data/__init__.py +0 -15
  1289. paddlex/ppcls/data/imaug/__init__.py +0 -94
  1290. paddlex/ppcls/data/imaug/autoaugment.py +0 -264
  1291. paddlex/ppcls/data/imaug/batch_operators.py +0 -117
  1292. paddlex/ppcls/data/imaug/cutout.py +0 -41
  1293. paddlex/ppcls/data/imaug/fmix.py +0 -217
  1294. paddlex/ppcls/data/imaug/grid.py +0 -89
  1295. paddlex/ppcls/data/imaug/hide_and_seek.py +0 -44
  1296. paddlex/ppcls/data/imaug/operators.py +0 -256
  1297. paddlex/ppcls/data/imaug/randaugment.py +0 -106
  1298. paddlex/ppcls/data/imaug/random_erasing.py +0 -55
  1299. paddlex/ppcls/data/reader.py +0 -318
  1300. paddlex/ppcls/modeling/__init__.py +0 -20
  1301. paddlex/ppcls/modeling/architectures/__init__.py +0 -51
  1302. paddlex/ppcls/modeling/architectures/alexnet.py +0 -132
  1303. paddlex/ppcls/modeling/architectures/darknet.py +0 -161
  1304. paddlex/ppcls/modeling/architectures/densenet.py +0 -308
  1305. paddlex/ppcls/modeling/architectures/distillation_models.py +0 -65
  1306. paddlex/ppcls/modeling/architectures/distilled_vision_transformer.py +0 -196
  1307. paddlex/ppcls/modeling/architectures/dpn.py +0 -425
  1308. paddlex/ppcls/modeling/architectures/efficientnet.py +0 -901
  1309. paddlex/ppcls/modeling/architectures/ghostnet.py +0 -331
  1310. paddlex/ppcls/modeling/architectures/googlenet.py +0 -207
  1311. paddlex/ppcls/modeling/architectures/hrnet.py +0 -742
  1312. paddlex/ppcls/modeling/architectures/inception_v3.py +0 -541
  1313. paddlex/ppcls/modeling/architectures/inception_v4.py +0 -455
  1314. paddlex/ppcls/modeling/architectures/mixnet.py +0 -782
  1315. paddlex/ppcls/modeling/architectures/mobilenet_v1.py +0 -266
  1316. paddlex/ppcls/modeling/architectures/mobilenet_v2.py +0 -248
  1317. paddlex/ppcls/modeling/architectures/mobilenet_v3.py +0 -359
  1318. paddlex/ppcls/modeling/architectures/regnet.py +0 -383
  1319. paddlex/ppcls/modeling/architectures/repvgg.py +0 -339
  1320. paddlex/ppcls/modeling/architectures/res2net.py +0 -272
  1321. paddlex/ppcls/modeling/architectures/res2net_vd.py +0 -295
  1322. paddlex/ppcls/modeling/architectures/resnest.py +0 -705
  1323. paddlex/ppcls/modeling/architectures/resnet.py +0 -317
  1324. paddlex/ppcls/modeling/architectures/resnet_vc.py +0 -309
  1325. paddlex/ppcls/modeling/architectures/resnet_vd.py +0 -354
  1326. paddlex/ppcls/modeling/architectures/resnext.py +0 -259
  1327. paddlex/ppcls/modeling/architectures/resnext101_wsl.py +0 -447
  1328. paddlex/ppcls/modeling/architectures/resnext_vd.py +0 -266
  1329. paddlex/ppcls/modeling/architectures/rexnet.py +0 -240
  1330. paddlex/ppcls/modeling/architectures/se_resnet_vd.py +0 -378
  1331. paddlex/ppcls/modeling/architectures/se_resnext.py +0 -290
  1332. paddlex/ppcls/modeling/architectures/se_resnext_vd.py +0 -285
  1333. paddlex/ppcls/modeling/architectures/shufflenet_v2.py +0 -320
  1334. paddlex/ppcls/modeling/architectures/squeezenet.py +0 -154
  1335. paddlex/ppcls/modeling/architectures/vgg.py +0 -152
  1336. paddlex/ppcls/modeling/architectures/vision_transformer.py +0 -402
  1337. paddlex/ppcls/modeling/architectures/xception.py +0 -345
  1338. paddlex/ppcls/modeling/architectures/xception_deeplab.py +0 -386
  1339. paddlex/ppcls/modeling/loss.py +0 -158
  1340. paddlex/ppcls/modeling/utils.py +0 -53
  1341. paddlex/ppcls/optimizer/__init__.py +0 -19
  1342. paddlex/ppcls/optimizer/learning_rate.py +0 -159
  1343. paddlex/ppcls/optimizer/optimizer.py +0 -165
  1344. paddlex/ppcls/utils/__init__.py +0 -27
  1345. paddlex/ppcls/utils/check.py +0 -151
  1346. paddlex/ppcls/utils/config.py +0 -201
  1347. paddlex/ppcls/utils/logger.py +0 -120
  1348. paddlex/ppcls/utils/metrics.py +0 -112
  1349. paddlex/ppcls/utils/misc.py +0 -62
  1350. paddlex/ppcls/utils/model_zoo.py +0 -213
  1351. paddlex/ppcls/utils/save_load.py +0 -163
  1352. paddlex/ppdet/__init__.py +0 -16
  1353. paddlex/ppdet/core/__init__.py +0 -15
  1354. paddlex/ppdet/core/config/__init__.py +0 -13
  1355. paddlex/ppdet/core/config/schema.py +0 -248
  1356. paddlex/ppdet/core/config/yaml_helpers.py +0 -118
  1357. paddlex/ppdet/core/workspace.py +0 -279
  1358. paddlex/ppdet/data/__init__.py +0 -21
  1359. paddlex/ppdet/data/reader.py +0 -304
  1360. paddlex/ppdet/data/shm_utils.py +0 -67
  1361. paddlex/ppdet/data/source/__init__.py +0 -27
  1362. paddlex/ppdet/data/source/category.py +0 -823
  1363. paddlex/ppdet/data/source/coco.py +0 -243
  1364. paddlex/ppdet/data/source/dataset.py +0 -192
  1365. paddlex/ppdet/data/source/keypoint_coco.py +0 -656
  1366. paddlex/ppdet/data/source/mot.py +0 -360
  1367. paddlex/ppdet/data/source/voc.py +0 -204
  1368. paddlex/ppdet/data/source/widerface.py +0 -180
  1369. paddlex/ppdet/data/transform/__init__.py +0 -28
  1370. paddlex/ppdet/data/transform/autoaugment_utils.py +0 -1593
  1371. paddlex/ppdet/data/transform/batch_operators.py +0 -758
  1372. paddlex/ppdet/data/transform/gridmask_utils.py +0 -83
  1373. paddlex/ppdet/data/transform/keypoint_operators.py +0 -665
  1374. paddlex/ppdet/data/transform/mot_operators.py +0 -636
  1375. paddlex/ppdet/data/transform/op_helper.py +0 -468
  1376. paddlex/ppdet/data/transform/operators.py +0 -2103
  1377. paddlex/ppdet/engine/__init__.py +0 -29
  1378. paddlex/ppdet/engine/callbacks.py +0 -262
  1379. paddlex/ppdet/engine/env.py +0 -47
  1380. paddlex/ppdet/engine/export_utils.py +0 -118
  1381. paddlex/ppdet/engine/tracker.py +0 -425
  1382. paddlex/ppdet/engine/trainer.py +0 -535
  1383. paddlex/ppdet/metrics/__init__.py +0 -23
  1384. paddlex/ppdet/metrics/coco_utils.py +0 -184
  1385. paddlex/ppdet/metrics/json_results.py +0 -151
  1386. paddlex/ppdet/metrics/keypoint_metrics.py +0 -202
  1387. paddlex/ppdet/metrics/map_utils.py +0 -396
  1388. paddlex/ppdet/metrics/metrics.py +0 -300
  1389. paddlex/ppdet/metrics/mot_eval_utils.py +0 -192
  1390. paddlex/ppdet/metrics/mot_metrics.py +0 -184
  1391. paddlex/ppdet/metrics/widerface_utils.py +0 -393
  1392. paddlex/ppdet/model_zoo/__init__.py +0 -18
  1393. paddlex/ppdet/model_zoo/model_zoo.py +0 -86
  1394. paddlex/ppdet/model_zoo/tests/__init__.py +0 -13
  1395. paddlex/ppdet/model_zoo/tests/test_get_model.py +0 -48
  1396. paddlex/ppdet/model_zoo/tests/test_list_model.py +0 -68
  1397. paddlex/ppdet/modeling/__init__.py +0 -41
  1398. paddlex/ppdet/modeling/architectures/__init__.py +0 -40
  1399. paddlex/ppdet/modeling/architectures/cascade_rcnn.py +0 -144
  1400. paddlex/ppdet/modeling/architectures/centernet.py +0 -103
  1401. paddlex/ppdet/modeling/architectures/deepsort.py +0 -111
  1402. paddlex/ppdet/modeling/architectures/fairmot.py +0 -107
  1403. paddlex/ppdet/modeling/architectures/faster_rcnn.py +0 -106
  1404. paddlex/ppdet/modeling/architectures/fcos.py +0 -105
  1405. paddlex/ppdet/modeling/architectures/jde.py +0 -125
  1406. paddlex/ppdet/modeling/architectures/keypoint_hrhrnet.py +0 -286
  1407. paddlex/ppdet/modeling/architectures/keypoint_hrnet.py +0 -203
  1408. paddlex/ppdet/modeling/architectures/mask_rcnn.py +0 -135
  1409. paddlex/ppdet/modeling/architectures/meta_arch.py +0 -45
  1410. paddlex/ppdet/modeling/architectures/s2anet.py +0 -103
  1411. paddlex/ppdet/modeling/architectures/solov2.py +0 -110
  1412. paddlex/ppdet/modeling/architectures/ssd.py +0 -84
  1413. paddlex/ppdet/modeling/architectures/ttfnet.py +0 -98
  1414. paddlex/ppdet/modeling/architectures/yolo.py +0 -104
  1415. paddlex/ppdet/modeling/backbones/__init__.py +0 -37
  1416. paddlex/ppdet/modeling/backbones/blazenet.py +0 -322
  1417. paddlex/ppdet/modeling/backbones/darknet.py +0 -341
  1418. paddlex/ppdet/modeling/backbones/dla.py +0 -244
  1419. paddlex/ppdet/modeling/backbones/ghostnet.py +0 -476
  1420. paddlex/ppdet/modeling/backbones/hrnet.py +0 -724
  1421. paddlex/ppdet/modeling/backbones/mobilenet_v1.py +0 -410
  1422. paddlex/ppdet/modeling/backbones/mobilenet_v3.py +0 -497
  1423. paddlex/ppdet/modeling/backbones/name_adapter.py +0 -69
  1424. paddlex/ppdet/modeling/backbones/res2net.py +0 -358
  1425. paddlex/ppdet/modeling/backbones/resnet.py +0 -606
  1426. paddlex/ppdet/modeling/backbones/senet.py +0 -140
  1427. paddlex/ppdet/modeling/backbones/vgg.py +0 -216
  1428. paddlex/ppdet/modeling/bbox_utils.py +0 -464
  1429. paddlex/ppdet/modeling/heads/__init__.py +0 -41
  1430. paddlex/ppdet/modeling/heads/bbox_head.py +0 -379
  1431. paddlex/ppdet/modeling/heads/cascade_head.py +0 -285
  1432. paddlex/ppdet/modeling/heads/centernet_head.py +0 -194
  1433. paddlex/ppdet/modeling/heads/face_head.py +0 -113
  1434. paddlex/ppdet/modeling/heads/fcos_head.py +0 -270
  1435. paddlex/ppdet/modeling/heads/keypoint_hrhrnet_head.py +0 -108
  1436. paddlex/ppdet/modeling/heads/mask_head.py +0 -253
  1437. paddlex/ppdet/modeling/heads/roi_extractor.py +0 -111
  1438. paddlex/ppdet/modeling/heads/s2anet_head.py +0 -845
  1439. paddlex/ppdet/modeling/heads/solov2_head.py +0 -537
  1440. paddlex/ppdet/modeling/heads/ssd_head.py +0 -175
  1441. paddlex/ppdet/modeling/heads/ttf_head.py +0 -314
  1442. paddlex/ppdet/modeling/heads/yolo_head.py +0 -124
  1443. paddlex/ppdet/modeling/keypoint_utils.py +0 -302
  1444. paddlex/ppdet/modeling/layers.py +0 -1142
  1445. paddlex/ppdet/modeling/losses/__init__.py +0 -35
  1446. paddlex/ppdet/modeling/losses/ctfocal_loss.py +0 -67
  1447. paddlex/ppdet/modeling/losses/fairmot_loss.py +0 -41
  1448. paddlex/ppdet/modeling/losses/fcos_loss.py +0 -225
  1449. paddlex/ppdet/modeling/losses/iou_aware_loss.py +0 -48
  1450. paddlex/ppdet/modeling/losses/iou_loss.py +0 -210
  1451. paddlex/ppdet/modeling/losses/jde_loss.py +0 -182
  1452. paddlex/ppdet/modeling/losses/keypoint_loss.py +0 -228
  1453. paddlex/ppdet/modeling/losses/solov2_loss.py +0 -101
  1454. paddlex/ppdet/modeling/losses/ssd_loss.py +0 -163
  1455. paddlex/ppdet/modeling/losses/yolo_loss.py +0 -212
  1456. paddlex/ppdet/modeling/mot/__init__.py +0 -25
  1457. paddlex/ppdet/modeling/mot/matching/__init__.py +0 -19
  1458. paddlex/ppdet/modeling/mot/matching/deepsort_matching.py +0 -382
  1459. paddlex/ppdet/modeling/mot/matching/jde_matching.py +0 -145
  1460. paddlex/ppdet/modeling/mot/motion/__init__.py +0 -17
  1461. paddlex/ppdet/modeling/mot/motion/kalman_filter.py +0 -270
  1462. paddlex/ppdet/modeling/mot/tracker/__init__.py +0 -23
  1463. paddlex/ppdet/modeling/mot/tracker/base_jde_tracker.py +0 -267
  1464. paddlex/ppdet/modeling/mot/tracker/base_sde_tracker.py +0 -145
  1465. paddlex/ppdet/modeling/mot/tracker/deepsort_tracker.py +0 -165
  1466. paddlex/ppdet/modeling/mot/tracker/jde_tracker.py +0 -262
  1467. paddlex/ppdet/modeling/mot/utils.py +0 -181
  1468. paddlex/ppdet/modeling/mot/visualization.py +0 -130
  1469. paddlex/ppdet/modeling/necks/__init__.py +0 -25
  1470. paddlex/ppdet/modeling/necks/centernet_fpn.py +0 -185
  1471. paddlex/ppdet/modeling/necks/fpn.py +0 -233
  1472. paddlex/ppdet/modeling/necks/hrfpn.py +0 -131
  1473. paddlex/ppdet/modeling/necks/ttf_fpn.py +0 -243
  1474. paddlex/ppdet/modeling/necks/yolo_fpn.py +0 -1034
  1475. paddlex/ppdet/modeling/ops.py +0 -1599
  1476. paddlex/ppdet/modeling/post_process.py +0 -449
  1477. paddlex/ppdet/modeling/proposal_generator/__init__.py +0 -2
  1478. paddlex/ppdet/modeling/proposal_generator/anchor_generator.py +0 -135
  1479. paddlex/ppdet/modeling/proposal_generator/proposal_generator.py +0 -81
  1480. paddlex/ppdet/modeling/proposal_generator/rpn_head.py +0 -269
  1481. paddlex/ppdet/modeling/proposal_generator/target.py +0 -671
  1482. paddlex/ppdet/modeling/proposal_generator/target_layer.py +0 -476
  1483. paddlex/ppdet/modeling/reid/__init__.py +0 -23
  1484. paddlex/ppdet/modeling/reid/fairmot_embedding_head.py +0 -117
  1485. paddlex/ppdet/modeling/reid/jde_embedding_head.py +0 -189
  1486. paddlex/ppdet/modeling/reid/pyramidal_embedding.py +0 -151
  1487. paddlex/ppdet/modeling/reid/resnet.py +0 -320
  1488. paddlex/ppdet/modeling/shape_spec.py +0 -33
  1489. paddlex/ppdet/modeling/tests/__init__.py +0 -13
  1490. paddlex/ppdet/modeling/tests/test_architectures.py +0 -59
  1491. paddlex/ppdet/modeling/tests/test_base.py +0 -75
  1492. paddlex/ppdet/modeling/tests/test_ops.py +0 -839
  1493. paddlex/ppdet/modeling/tests/test_yolov3_loss.py +0 -420
  1494. paddlex/ppdet/optimizer.py +0 -285
  1495. paddlex/ppdet/slim/__init__.py +0 -62
  1496. paddlex/ppdet/slim/distill.py +0 -111
  1497. paddlex/ppdet/slim/prune.py +0 -85
  1498. paddlex/ppdet/slim/quant.py +0 -52
  1499. paddlex/ppdet/utils/__init__.py +0 -13
  1500. paddlex/ppdet/utils/check.py +0 -93
  1501. paddlex/ppdet/utils/checkpoint.py +0 -216
  1502. paddlex/ppdet/utils/cli.py +0 -151
  1503. paddlex/ppdet/utils/colormap.py +0 -56
  1504. paddlex/ppdet/utils/download.py +0 -477
  1505. paddlex/ppdet/utils/logger.py +0 -71
  1506. paddlex/ppdet/utils/stats.py +0 -95
  1507. paddlex/ppdet/utils/visualizer.py +0 -292
  1508. paddlex/ppdet/utils/voc_utils.py +0 -87
  1509. paddlex/seg.py +0 -38
  1510. paddlex/tools/__init__.py +0 -16
  1511. paddlex/tools/convert.py +0 -52
  1512. paddlex/tools/dataset_conversion/__init__.py +0 -24
  1513. paddlex/tools/dataset_conversion/x2coco.py +0 -379
  1514. paddlex/tools/dataset_conversion/x2imagenet.py +0 -82
  1515. paddlex/tools/dataset_conversion/x2seg.py +0 -343
  1516. paddlex/tools/dataset_conversion/x2voc.py +0 -230
  1517. paddlex/tools/dataset_split/__init__.py +0 -23
  1518. paddlex/tools/dataset_split/coco_split.py +0 -69
  1519. paddlex/tools/dataset_split/imagenet_split.py +0 -75
  1520. paddlex/tools/dataset_split/seg_split.py +0 -96
  1521. paddlex/tools/dataset_split/utils.py +0 -75
  1522. paddlex/tools/dataset_split/voc_split.py +0 -91
  1523. paddlex/tools/split.py +0 -41
  1524. paddlex/utils/checkpoint.py +0 -439
  1525. paddlex/utils/shm.py +0 -67
  1526. paddlex/utils/stats.py +0 -68
  1527. paddlex/utils/utils.py +0 -140
  1528. paddlex-2.0.0rc4.dist-info/LICENSE +0 -201
  1529. paddlex-2.0.0rc4.dist-info/METADATA +0 -29
  1530. paddlex-2.0.0rc4.dist-info/RECORD +0 -445
  1531. paddlex-2.0.0rc4.dist-info/WHEEL +0 -5
  1532. paddlex-2.0.0rc4.dist-info/entry_points.txt +0 -3
  1533. paddlex-2.0.0rc4.dist-info/top_level.txt +0 -2
@@ -0,0 +1,3720 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import copy
+ import inspect
+ import io
+ import json
+ import os
+ import warnings
+ from collections import UserDict
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import (
+     Any,
+     Dict,
+     List,
+     Literal,
+     NamedTuple,
+     Optional,
+     Sequence,
+     Tuple,
+     Union,
+ )
+
+ import numpy as np
+
+ from .....utils import logging
+
+ __all__ = [
+     "AddedToken",
+     "FastEncoding",
+     "ExplicitEnum",
+     "PaddingStrategy",
+     "TensorType",
+     "TruncationStrategy",
+     "CharSpan",
+     "TokenSpan",
+     "BatchEncoding",
+     "SpecialTokensMixin",
+     "PretrainedTokenizerBase",
+ ]
+
+ TOKENIZER_CONFIG_NAME = "tokenizer_config.json"
+ CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"
+
+ VERY_LARGE_INTEGER = int(
+     1e30
+ )  # This is used to set the max input length for a model with infinite size input
+ LARGE_INTEGER = int(
+     1e20
+ )  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER
+
+ # Define type aliases and NamedTuples
+ TextInput = str
+ PreTokenizedInput = List[str]
+ EncodedInput = List[int]
+ TextInputPair = Tuple[str, str]
+ PreTokenizedInputPair = Tuple[List[str], List[str]]
+ EncodedInputPair = Tuple[List[int], List[int]]
+
+ # Slow tokenizers used to be saved in three separated files
+ SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
+ ADDED_TOKENS_FILE = "added_tokens.json"
+ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
+
+
+ @dataclass(frozen=True, eq=True)
+ class AddedToken:
+     """
+     AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the
+     way it should behave.
+     """
+
+     content: str = field(default_factory=str)
+     single_word: bool = False
+     lstrip: bool = False
+     rstrip: bool = False
+     normalized: bool = True
+     special: bool = True
+
+     def __getstate__(self):
+         return self.__dict__
+
+     def __str__(self):
+         return self.content
+
+
+ @dataclass
+ class FastEncoding:
+     """This is dummy class reserved for fast tokenizer"""
+
+
+ class ExplicitEnum(Enum):
+     """
+     Enum with more explicit error message for missing values.
+     """
+
+     @classmethod
+     def _missing_(cls, value):
+         raise ValueError(
+             f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
+         )
+
+
+ class PaddingStrategy(ExplicitEnum):
+     """
+     Possible values for the `padding` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+     IDE.
+     """
+
+     LONGEST = "longest"
+     MAX_LENGTH = "max_length"
+     DO_NOT_PAD = "do_not_pad"
+
+
+ class TensorType(ExplicitEnum):
+     """
+     Possible values for the `return_tensors` argument in [`PretrainedTokenizerBase.__call__`]. Useful for
+     tab-completion in an IDE.
+     """
+
+     PADDLE = "pd"
+     NUMPY = "np"
+
+
+ def to_py_obj(obj):
+     """
+     Convert a Paddle tensor, Numpy array or python list to a python list.
+     """
+     import paddle
+
+     if isinstance(obj, (dict, UserDict)):
+         return {k: to_py_obj(v) for k, v in obj.items()}
+     elif isinstance(obj, (list, tuple)):
+         return [to_py_obj(o) for o in obj]
+     elif isinstance(obj, paddle.Tensor):
+         return obj.numpy().tolist()
+     elif isinstance(obj, (np.ndarray, np.number)):  # tolist also works on 0d np arrays
+         return obj.tolist()
+     else:
+         return obj
+
+
+ def _is_numpy(x):
+     return isinstance(x, np.ndarray)
+
+
+ class TruncationStrategy(ExplicitEnum):
+     """
+     Possible values for the `truncation` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in
+     an IDE.
+     """
+
+     ONLY_FIRST = "only_first"
+     ONLY_SECOND = "only_second"
+     LONGEST_FIRST = "longest_first"
+     DO_NOT_TRUNCATE = "do_not_truncate"
+
+
+ class CharSpan(NamedTuple):
+     """
+     Character span in the original string.
+
+     Args:
+         start (`int`): Index of the first character in the original string.
+         end (`int`): Index of the character following the last character in the original string.
+     """
+
+     start: int
+     end: int
+
+
+ class TokenSpan(NamedTuple):
+     """
+     Token span in an encoded string (list of tokens).
+
+     Args:
+         start (`int`): Index of the first token in the span.
+         end (`int`): Index of the token following the last token in the span.
+     """
+
+     start: int
+     end: int
+
+
197
+ class BatchEncoding(UserDict):
198
+ """
199
+ Holds the output of the [`PretrainedTokenizerBase.__call__`],
200
+ [`PretrainedTokenizerBase.encode_plus`] and
201
+ [`PretrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).
202
+
203
+ This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
204
+ utility methods to map from word/character space to token space.
205
+
206
+ Args:
207
+ data (`dict`):
208
+ Dictionary of lists/arrays/tensors returned by the `__call__`/`encode`/`batch_encode` methods
209
+ ('input_ids', 'attention_mask', etc.).
210
+ tensor_type (`Union[None, str, TensorType]`, *optional*):
211
+ You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at
212
+ initialization.
213
+ prepend_batch_axis (`bool`, *optional*, defaults to `False`):
214
+ Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
215
+ """
216
+
217
+ def __init__(
218
+ self,
219
+ data: Optional[Dict[str, Any]] = None,
220
+ encoding: Optional[Union[FastEncoding, Sequence[FastEncoding]]] = None,
221
+ tensor_type: Union[None, str] = None,
222
+ prepend_batch_axis: bool = False,
223
+ n_sequences: Optional[int] = None,
224
+ ):
225
+ super().__init__(data)
226
+
227
+ if isinstance(encoding, FastEncoding):
228
+ encoding = [encoding]
229
+
230
+ self._encodings = encoding
231
+
232
+ if n_sequences is None and encoding is not None and len(encoding):
233
+ n_sequences = encoding[0].n_sequences
234
+
235
+ self._n_sequences = n_sequences
236
+
237
+ self.convert_to_tensors(
238
+ tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis
239
+ )
240
+
241
+ @property
242
+ def n_sequences(self) -> Optional[int]:
243
+ """
244
+ `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
245
+ [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
246
+ sentences)
247
+ """
248
+ return self._n_sequences
249
+
250
+ @property
251
+ def is_fast(self) -> bool:
252
+ """
253
+ `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PretrainedFastTokenizer`]
254
+ or not.
255
+ """
256
+ return self._encodings is not None
257
+
258
+ def __getitem__(self, item: Union[int, str]) -> Union[Any, FastEncoding]:
259
+ """
260
+ If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
261
+ etc.).
262
+
263
+ If the key is an integer, get the `Encoding` for batch item with index `key`.
264
+ """
265
+ if isinstance(item, str):
266
+ return self.data[item]
267
+ elif self._encodings is not None:
268
+ return self._encodings[item]
269
+ else:
270
+ raise KeyError(
271
+ "Indexing with integers is not available when using tokenizer.__call__()"
272
+ " with return_dict=True. Please set return_dict to False to use integer indexing."
273
+ )
274
+
275
+ def __getattr__(self, item: str):
276
+ try:
277
+ return self.data[item]
278
+ except KeyError:
279
+ raise AttributeError
280
+
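+ # Illustrative sketch (assumed usage): dict-style and attribute-style access both read from
+ # `self.data`, so for an encoding `enc` holding an 'input_ids' entry the two forms are equivalent:
+ #     enc["input_ids"]    # via __getitem__
+ #     enc.input_ids       # via __getattr__; raises AttributeError if the key is missing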
281
+ def __getstate__(self):
282
+ return {"data": self.data, "encodings": self._encodings}
283
+
284
+ def __setstate__(self, state):
285
+ if "data" in state:
286
+ self.data = state["data"]
287
+
288
+ if "encodings" in state:
289
+ self._encodings = state["encodings"]
290
+
291
+ def keys(self):
292
+ return self.data.keys()
293
+
294
+ def values(self):
295
+ return self.data.values()
296
+
297
+ def items(self):
298
+ return self.data.items()
299
+
300
+ @property
301
+ def encodings(self) -> Optional[List[FastEncoding]]:
302
+ """
303
+ `Optional[List[FastEncoding]]`: The list of all encodings from the tokenization process. Returns `None` if
304
+ the input was tokenized by a Python (i.e., not fast) tokenizer.
305
+ """
306
+ return self._encodings
307
+
308
+ def tokens(self, batch_index: int = 0) -> List[str]:
309
+ """
310
+ Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
311
+ integer indices) at a given batch index (only works for the output of a fast tokenizer).
312
+
313
+ Args:
314
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
315
+
316
+ Returns:
317
+ `List[str]`: The list of tokens at that index.
318
+ """
319
+ if not self._encodings:
320
+ raise ValueError(
321
+ "tokens() is not available when using Python-based tokenizers"
322
+ )
323
+ return self._encodings[batch_index].tokens
324
+
325
+ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
326
+ """
327
+ Return a list mapping the tokens to the id of their original sentences:
328
+
329
+ - `None` for special tokens added around or between sequences,
330
+ - `0` for tokens corresponding to words in the first sequence,
331
+ - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
332
+ encoded.
333
+
334
+ Args:
335
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
336
+
337
+ Returns:
338
+ `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
339
+ by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
340
+ sequence.
341
+ """
342
+ if not self._encodings:
343
+ raise ValueError(
344
+ "sequence_ids() is not available when using Python-based tokenizers"
345
+ )
346
+ return self._encodings[batch_index].sequence_ids
347
+
348
+ def words(self, batch_index: int = 0) -> List[Optional[int]]:
349
+ """
350
+ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
351
+
352
+ Args:
353
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
354
+
355
+ Returns:
356
+ `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
357
+ tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
358
+ (several tokens will be mapped to the same word index if they are parts of that word).
359
+ """
360
+ if not self._encodings:
361
+ raise ValueError(
362
+ "words() is not available when using Python-based tokenizers"
363
+ )
364
+ warnings.warn(
365
+ "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
366
+ "but more self-explanatory `BatchEncoding.word_ids()` property.",
367
+ FutureWarning,
368
+ )
369
+ return self.word_ids(batch_index)
370
+
371
+ def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
372
+ """
373
+ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
374
+
375
+ Args:
376
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
377
+
378
+ Returns:
379
+ `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
380
+ tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
381
+ (several tokens will be mapped to the same word index if they are parts of that word).
382
+ """
383
+ if not self._encodings:
384
+ raise ValueError(
385
+ "word_ids() is not available when using Python-based tokenizers"
386
+ )
387
+ return self._encodings[batch_index].word_ids
388
+
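+ # Illustrative sketch (hypothetical tokenizer and outputs, assuming a fast tokenizer backs the
+ # encoding): tokens(), word_ids() and sequence_ids() line up index by index, e.g.
+ #     enc = fast_tokenizer("unaffable text")          # hypothetical call
+ #     enc.tokens()        # e.g. ['[CLS]', 'un', '##aff', '##able', 'text', '[SEP]']
+ #     enc.word_ids()      # e.g. [None, 0, 0, 0, 1, None]   (None marks special tokens)
+ #     enc.sequence_ids()  # e.g. [None, 0, 0, 0, 0, None]   (single sequence -> all 0)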
389
+ def token_to_sequence(
390
+ self, batch_or_token_index: int, token_index: Optional[int] = None
391
+ ) -> int:
392
+ """
393
+ Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
394
+ for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair
395
+
396
+ Can be called as:
397
+
398
+ - `self.token_to_sequence(token_index)` if batch size is 1
399
+ - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1
400
+
401
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
402
+ words are defined by the user). In this case it makes it easy to associate encoded tokens with the provided
403
+ tokenized words.
404
+
405
+ Args:
406
+ batch_or_token_index (`int`):
407
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
408
+ the token in the sequence.
409
+ token_index (`int`, *optional*):
410
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
411
+ sequence.
412
+
413
+ Returns:
414
+ `int`: Index of the sequence the token belongs to.
415
+ """
416
+
417
+ if not self._encodings:
418
+ raise ValueError(
419
+ "token_to_sequence() is not available when using Python based tokenizers"
420
+ )
421
+ if token_index is not None:
422
+ batch_index = batch_or_token_index
423
+ else:
424
+ batch_index = 0
425
+ token_index = batch_or_token_index
426
+ if batch_index < 0:
427
+ batch_index = self._batch_size + batch_index
428
+ if token_index < 0:
429
+ token_index = self._seq_len + token_index
430
+ return self._encodings[batch_index].token_to_sequence(token_index)
431
+
432
+ def token_to_word(
433
+ self, batch_or_token_index: int, token_index: Optional[int] = None
434
+ ) -> int:
435
+ """
436
+ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.
437
+
438
+ Can be called as:
439
+
440
+ - `self.token_to_word(token_index)` if batch size is 1
441
+ - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1
442
+
443
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
444
+ words are defined by the user). In this case it makes it easy to associate encoded tokens with the provided
445
+ tokenized words.
446
+
447
+ Args:
448
+ batch_or_token_index (`int`):
449
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
450
+ the token in the sequence.
451
+ token_index (`int`, *optional*):
452
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
453
+ sequence.
454
+
455
+ Returns:
456
+ `int`: Index of the word in the input sequence.
457
+ """
458
+
459
+ if not self._encodings:
460
+ raise ValueError(
461
+ "token_to_word() is not available when using Python based tokenizers"
462
+ )
463
+ if token_index is not None:
464
+ batch_index = batch_or_token_index
465
+ else:
466
+ batch_index = 0
467
+ token_index = batch_or_token_index
468
+ if batch_index < 0:
469
+ batch_index = self._batch_size + batch_index
470
+ if token_index < 0:
471
+ token_index = self._seq_len + token_index
472
+ return self._encodings[batch_index].token_to_word(token_index)
473
+
474
+ def word_to_tokens(
475
+ self,
476
+ batch_or_word_index: int,
477
+ word_index: Optional[int] = None,
478
+ sequence_index: int = 0,
479
+ ) -> Optional[TokenSpan]:
480
+ """
481
+ Get the encoded token span corresponding to a word in a sequence of the batch.
482
+
483
+ Token spans are returned as a [`TokenSpan`] with:
484
+
485
+ - **start** -- Index of the first token.
486
+ - **end** -- Index of the token following the last token.
487
+
488
+ Can be called as:
489
+
490
+ - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
491
+ - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
492
+ 1
493
+
494
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
495
+ are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
496
+ words.
497
+
498
+ Args:
499
+ batch_or_word_index (`int`):
500
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
501
+ the word in the sequence.
502
+ word_index (`int`, *optional*):
503
+ If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
504
+ sequence.
505
+ sequence_index (`int`, *optional*, defaults to 0):
506
+ If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
507
+ or 1) the provided word index belongs to.
508
+
509
+ Returns:
510
+ Optional [`TokenSpan`]: Span of tokens in the encoded sequence. Returns `None` if
511
+ no tokens correspond to the word.
512
+ """
513
+
514
+ if not self._encodings:
515
+ raise ValueError(
516
+ "word_to_tokens() is not available when using Python based tokenizers"
517
+ )
518
+ if word_index is not None:
519
+ batch_index = batch_or_word_index
520
+ else:
521
+ batch_index = 0
522
+ word_index = batch_or_word_index
523
+ if batch_index < 0:
524
+ batch_index = self._batch_size + batch_index
525
+ if word_index < 0:
526
+ word_index = self._seq_len + word_index
527
+ span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
528
+ return TokenSpan(*span) if span is not None else None
529
+
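+ # Illustrative sketch (hypothetical names and kwargs): with pre-tokenized input, word_to_tokens()
+ # and token_to_word() are inverse mappings between word indices and token indices, e.g.
+ #     enc = fast_tokenizer(["my", "name", "is", "unaffable"], is_split_into_words=True)  # hypothetical kwargs
+ #     span = enc.word_to_tokens(3)              # TokenSpan covering the sub-word tokens of word 3
+ #     enc.tokens()[span.start:span.end]         # the sub-word tokens that make up that word
+ #     enc.token_to_word(span.start)             # -> 3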
530
+ def token_to_chars(
531
+ self, batch_or_token_index: int, token_index: Optional[int] = None
532
+ ) -> CharSpan:
533
+ """
534
+ Get the character span corresponding to an encoded token in a sequence of the batch.
535
+
536
+ Character spans are returned as a [`CharSpan`] with:
537
+
538
+ - **start** -- Index of the first character in the original string associated to the token.
539
+ - **end** -- Index of the character following the last character in the original string associated to the
540
+ token.
541
+
542
+ Can be called as:
543
+
544
+ - `self.token_to_chars(token_index)` if batch size is 1
545
+ - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1
546
+
547
+ Args:
548
+ batch_or_token_index (`int`):
549
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
550
+ the token in the sequence.
551
+ token_index (`int`, *optional*):
552
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
553
+ the sequence.
554
+
555
+ Returns:
556
+ [`CharSpan`]: Span of characters in the original string.
557
+ """
558
+
559
+ if not self._encodings:
560
+ raise ValueError(
561
+ "token_to_chars() is not available when using Python based tokenizers"
562
+ )
563
+ if token_index is not None:
564
+ batch_index = batch_or_token_index
565
+ else:
566
+ batch_index = 0
567
+ token_index = batch_or_token_index
568
+ return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
569
+
570
+ def char_to_token(
571
+ self,
572
+ batch_or_char_index: int,
573
+ char_index: Optional[int] = None,
574
+ sequence_index: int = 0,
575
+ ) -> int:
576
+ """
577
+ Get the index of the token in the encoded output comprising a character in the original string for a sequence
578
+ of the batch.
579
+
580
+ Can be called as:
581
+
582
+ - `self.char_to_token(char_index)` if batch size is 1
583
+ - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1
584
+
585
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
586
+ are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
587
+ words.
588
+
589
+ Args:
590
+ batch_or_char_index (`int`):
591
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
592
+ the character in the original string.
593
+ char_index (`int`, *optional*):
594
+ If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
595
+ original string.
596
+ sequence_index (`int`, *optional*, defaults to 0):
597
+ If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
598
+ or 1) the provided character index belongs to.
599
+
600
+
601
+ Returns:
602
+ `int`: Index of the token.
603
+ """
604
+
605
+ if not self._encodings:
606
+ raise ValueError(
607
+ "char_to_token() is not available when using Python based tokenizers"
608
+ )
609
+ if char_index is not None:
610
+ batch_index = batch_or_char_index
611
+ else:
612
+ batch_index = 0
613
+ char_index = batch_or_char_index
614
+ return self._encodings[batch_index].char_to_token(char_index, sequence_index)
615
+
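+ # Illustrative sketch (assumes `enc` was produced by a fast tokenizer over `text`):
+ # character offsets and token indices can be round-tripped, e.g.
+ #     tok_idx = enc.char_to_token(5)            # token covering text[5]
+ #     span = enc.token_to_chars(tok_idx)        # CharSpan for that token
+ #     text[span.start:span.end]                 # original substring behind the token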
616
+ def word_to_chars(
617
+ self,
618
+ batch_or_word_index: int,
619
+ word_index: Optional[int] = None,
620
+ sequence_index: int = 0,
621
+ ) -> CharSpan:
622
+ """
623
+ Get the character span in the original string corresponding to given word in a sequence of the batch.
624
+
625
+ Character spans are returned as a CharSpan NamedTuple with:
626
+
627
+ - start: index of the first character in the original string
628
+ - end: index of the character following the last character in the original string
629
+
630
+ Can be called as:
631
+
632
+ - `self.word_to_chars(word_index)` if batch size is 1
633
+ - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1
634
+
635
+ Args:
636
+ batch_or_word_index (`int`):
637
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
638
+ the word in the sequence
639
+ word_index (`int`, *optional*):
640
+ If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
641
+ sequence.
642
+ sequence_index (`int`, *optional*, defaults to 0):
643
+ If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
644
+ or 1) the provided word index belongs to.
645
+
646
+ Returns:
647
+ `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
648
+ are NamedTuple with:
649
+
650
+ - start: index of the first character associated to the token in the original string
651
+ - end: index of the character following the last character associated to the token in the original
652
+ string
653
+ """
654
+
655
+ if not self._encodings:
656
+ raise ValueError(
657
+ "word_to_chars() is not available when using Python based tokenizers"
658
+ )
659
+ if word_index is not None:
660
+ batch_index = batch_or_word_index
661
+ else:
662
+ batch_index = 0
663
+ word_index = batch_or_word_index
664
+ return CharSpan(
665
+ *(self._encodings[batch_index].word_to_chars(word_index, sequence_index))
666
+ )
667
+
668
+ def char_to_word(
669
+ self,
670
+ batch_or_char_index: int,
671
+ char_index: Optional[int] = None,
672
+ sequence_index: int = 0,
673
+ ) -> int:
674
+ """
675
+ Get the word in the original string corresponding to a character in the original string of a sequence of the
676
+ batch.
677
+
678
+ Can be called as:
679
+
680
+ - `self.char_to_word(char_index)` if batch size is 1
681
+ - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1
682
+
683
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
684
+ are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
685
+ words.
686
+
687
+ Args:
688
+ batch_or_char_index (`int`):
689
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
690
+ the character in the original string.
691
+ char_index (`int`, *optional*):
692
+ If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
693
+ original string.
694
+ sequence_index (`int`, *optional*, defaults to 0):
695
+ If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
696
+ or 1) the provided character index belongs to.
697
+
698
+
699
+ Returns:
700
+ `int` or `List[int]`: Index or indices of the associated word(s) in the original string.
701
+ """
702
+
703
+ if not self._encodings:
704
+ raise ValueError(
705
+ "char_to_word() is not available when using Python based tokenizers"
706
+ )
707
+ if char_index is not None:
708
+ batch_index = batch_or_char_index
709
+ else:
710
+ batch_index = 0
711
+ char_index = batch_or_char_index
712
+ return self._encodings[batch_index].char_to_word(char_index, sequence_index)
713
+
714
+ def convert_to_tensors(
715
+ self,
716
+ tensor_type: Optional[Union[str, TensorType]] = None,
717
+ prepend_batch_axis: bool = False,
718
+ ):
719
+ """
720
+ Convert the inner content to tensors.
721
+
722
+ Args:
723
+ tensor_type (`str` or [`TensorType`], *optional*):
724
+ The type of tensors to use. If `str`, should be one of the values of the enum [`TensorType`]. If
725
+ `None`, no modification is done.
726
+ prepend_batch_axis (`bool`, *optional*, defaults to `False`):
727
+ Whether or not to add the batch dimension during the conversion.
728
+ """
729
+ import paddle
730
+
731
+ if tensor_type is None:
732
+ return self
733
+
734
+ # Convert to TensorType
735
+ if not isinstance(tensor_type, TensorType):
736
+ tensor_type = TensorType(tensor_type)
737
+ # Get a function reference for the correct framework
738
+ if tensor_type == TensorType.PADDLE:
739
+ as_tensor = paddle.to_tensor
740
+ is_tensor = paddle.is_tensor
741
+ else:
742
+ as_tensor = np.asarray
743
+ is_tensor = _is_numpy
744
+
745
+ # Do the tensor conversion in batch
746
+ for key, value in self.items():
747
+ try:
748
+ if prepend_batch_axis:
749
+ value = [value]
750
+
751
+ if not is_tensor(value):
752
+ tensor = as_tensor(value)
753
+
754
+ self[key] = tensor
755
+ except: # noqa E722
756
+ if key == "overflowing_tokens":
757
+ raise ValueError(
758
+ "Unable to create tensor returning overflowing tokens of different lengths. "
759
+ "Please see if a fast version of this tokenizer is available to have this feature available."
760
+ )
761
+ raise ValueError(
762
+ "Unable to create tensor, you should probably activate truncation and/or padding "
763
+ "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
764
+ )
765
+
766
+ return self
767
+
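+ # Illustrative sketch (assumed usage): after padding/truncating to equal lengths, the stored
+ # lists can be turned into Paddle tensors in place; any other tensor_type falls back to the
+ # NumPy branch above, e.g.
+ #     encoded.convert_to_tensors(tensor_type=TensorType.PADDLE)
+ #     encoded["input_ids"]     # now a paddle.Tensor instead of a list of ints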
768
+
769
+ class SpecialTokensMixin:
770
+ """
771
+ A mixin derived by [`PretrainedTokenizer`] to handle specific behaviors related to
772
+ special tokens. In particular, this class holds the attributes which can be used to directly access these special
773
+ tokens in a model-independent manner and allows setting and updating the special tokens.
774
+
775
+ Args:
776
+ bos_token (`str` or `AddedToken`, *optional*):
777
+ A special token representing the beginning of a sentence.
778
+ eos_token (`str` or `AddedToken`, *optional*):
779
+ A special token representing the end of a sentence.
780
+ unk_token (`str` or `AddedToken`, *optional*):
781
+ A special token representing an out-of-vocabulary token.
782
+ sep_token (`str` or `AddedToken`, *optional*):
783
+ A special token separating two different sentences in the same input (used by BERT for instance).
784
+ pad_token (`str` or `AddedToken`, *optional*):
785
+ A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
786
+ attention mechanisms or loss computation.
787
+ cls_token (`str` or `AddedToken`, *optional*):
788
+ A special token representing the class of the input (used by BERT for instance).
789
+ mask_token (`str` or `AddedToken`, *optional*):
790
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
791
+ BERT).
792
+ additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*):
793
+ A tuple or a list of additional special tokens.
794
+ """
795
+
796
+ SPECIAL_TOKENS_ATTRIBUTES = [
797
+ "bos_token",
798
+ "eos_token",
799
+ "unk_token",
800
+ "sep_token",
801
+ "pad_token",
802
+ "cls_token",
803
+ "mask_token",
804
+ "additional_special_tokens",
805
+ ]
806
+
807
+ def __init__(self, verbose=True, **kwargs):
808
+ # note(guosheng): Since `__init__` might be called multiple times which
809
+ # is hooked before `PretrainedTokenizer` init, we do not set to None as
810
+ # HF to avoid unintentional overriding.
811
+ self._bos_token = getattr(self, "_bos_token", None)
812
+ self._eos_token = getattr(self, "_eos_token", None)
813
+ self._unk_token = getattr(self, "_unk_token", None)
814
+ self._sep_token = getattr(self, "_sep_token", None)
815
+ self._pad_token = getattr(self, "_pad_token", None)
816
+ self._cls_token = getattr(self, "_cls_token", None)
817
+ self._mask_token = getattr(self, "_mask_token", None)
818
+ self._pad_token_type_id = getattr(self, "_pad_token_type_id", 0)
819
+ self._additional_special_tokens = getattr(
820
+ self, "_additional_special_tokens", []
821
+ )
822
+ self.verbose = verbose
823
+
824
+ # We directly set the hidden value to allow initialization with special tokens
825
+ # which are not yet in the vocabulary. Necessary for serialization/de-serialization
826
+ # TODO clean this up at some point (probably by switching to fast tokenizers)
827
+ for key, value in kwargs.items():
828
+ if value is None:
829
+ continue
830
+ if key in self.SPECIAL_TOKENS_ATTRIBUTES:
831
+ if key == "additional_special_tokens":
832
+ assert isinstance(
833
+ value, (list, tuple)
834
+ ), f"Value {value} is not a list or tuple"
835
+ assert all(
836
+ isinstance(t, (str, AddedToken)) for t in value
837
+ ), "One of the tokens is not a string or an AddedToken"
838
+ setattr(self, key, value)
839
+ elif isinstance(value, (str, AddedToken)):
840
+ setattr(self, key, value)
841
+ else:
842
+ raise TypeError(
843
+ f"special token {key} has to be either str or AddedToken but got: {type(value)}"
844
+ )
845
+
846
+ def sanitize_special_tokens(self) -> int:
847
+ """
848
+ Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
849
+ `tokenizer.cls_token`, etc.) are in the vocabulary.
850
+
851
+ Add the missing ones to the vocabulary if needed.
852
+
853
+ Return:
854
+ `int`: The number of tokens added in the vocabulary during the operation.
855
+ """
856
+ return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
857
+
858
+ def add_special_tokens(
859
+ self,
860
+ special_tokens_dict: Dict[str, Union[str, AddedToken]],
861
+ replace_additional_special_tokens=True,
862
+ ) -> int:
863
+ """
864
+ Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
865
+ special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
866
+ current vocabulary).
867
+
868
+ When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
869
+ model so that its embedding matrix matches the tokenizer.
870
+
871
+ In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
872
+
873
+ Using `add_special_tokens` will ensure your special tokens can be used in several ways:
874
+
875
+ - Special tokens are carefully handled by the tokenizer (they are never split).
876
+ - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
877
+ makes it easy to develop model-agnostic training and fine-tuning scripts.
878
+
879
+ When possible, special tokens are already registered for provided pretrained models (for instance
880
+ [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
881
+ `'</s>'`).
882
+
883
+ Args:
884
+ special_tokens_dict (dictionary *str* to *str* or `AddedToken`):
885
+ Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
886
+ `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].
887
+
888
+ Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
889
+ assigns the index of the `unk_token` to them).
890
+ replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
891
+ If `True`, the existing list of additional special tokens will be replaced by the list provided in
892
+ `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former
893
+ case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
894
+ as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
895
+ `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
896
+ `additional_special_tokens` are still added tokens, and will not be split by the model.
897
+
898
+ Returns:
899
+ `int`: Number of tokens added to the vocabulary.
900
+
901
+ Examples:
902
+
903
+ ```python
904
+ # Let's see how to add a new classification token to GPT-2
905
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
906
+ model = GPT2Model.from_pretrained("gpt2")
907
+
908
+ special_tokens_dict = {"cls_token": "<CLS>"}
909
+
910
+ num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
911
+ print("We have added", num_added_toks, "tokens")
912
+ # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
913
+ model.resize_token_embeddings(len(tokenizer))
914
+
915
+ assert tokenizer.cls_token == "<CLS>"
916
+ ```"""
917
+ if not special_tokens_dict:
918
+ return 0
919
+
920
+ added_tokens = []
921
+ for key, value in special_tokens_dict.items():
922
+ assert (
923
+ key in self.SPECIAL_TOKENS_ATTRIBUTES
924
+ ), f"Key {key} is not a special token"
925
+
926
+ if self.verbose:
927
+ logging.info(f"Assigning {value} to the {key} key of the tokenizer")
928
+
929
+ if key == "additional_special_tokens":
930
+ assert isinstance(value, (list, tuple)) and all(
931
+ isinstance(t, (str, AddedToken)) for t in value
932
+ ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
933
+
934
+ to_add = []
935
+ for token in value:
936
+ if (
937
+ not replace_additional_special_tokens
938
+ and str(token) in self.additional_special_tokens
939
+ ):
940
+ continue
941
+ to_add.append(token)
942
+ if replace_additional_special_tokens and len(to_add) > 0:
943
+ setattr(self, key, list(to_add))
944
+ else:
945
+ self._additional_special_tokens.extend(to_add)
946
+ added_tokens += to_add
947
+
948
+ else:
949
+ if not isinstance(value, (str, AddedToken)):
950
+ raise ValueError(
951
+ f"Token {value} for key {key} should be a str or an AddedToken instance"
952
+ )
953
+ setattr(self, key, value)
954
+ if value not in added_tokens:
955
+ added_tokens.append(value)
956
+
957
+ # if we are adding tokens that were not part of the vocab, we ought to add them
958
+ added_tokens = self.add_tokens(added_tokens, special_tokens=True)
959
+ return added_tokens
960
+
961
+ def add_tokens(
962
+ self,
963
+ new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]],
964
+ special_tokens: bool = False,
965
+ ) -> int:
966
+ """
967
+ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
968
+ it with indices starting from length of the current vocabulary.
969
+
970
+ Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
971
+ matrix of the model so that its embedding matrix matches the tokenizer.
972
+
973
+ In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
974
+
975
+ Args:
976
+ new_tokens (`str`, `AddedToken` or a list of *str* or `AddedToken`):
977
+ Tokens are only added if they are not already in the vocabulary. `AddedToken` wraps a string
978
+ token to let you personalize its behavior: whether this token should only match against a single word,
979
+ whether this token should strip all potential whitespaces on the left side, whether this token should
980
+ strip all potential whitespaces on the right side, etc.
981
+ special_tokens (`bool`, *optional*, defaults to `False`):
982
+ Can be used to specify if the token is a special token. This mostly changes the normalization behavior
983
+ (special tokens like CLS or [MASK] are usually not lower-cased for instance).
984
+
985
+ Returns:
986
+ `int`: Number of tokens added to the vocabulary.
987
+
988
+ Examples:
989
+
990
+ ```python
991
+ # Let's see how to increase the vocabulary of Bert model and tokenizer
992
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
993
+ model = BertModel.from_pretrained("bert-base-uncased")
994
+
995
+ num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
996
+ print("We have added", num_added_toks, "tokens")
997
+ # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
998
+ model.resize_token_embeddings(len(tokenizer))
999
+ ```"""
1000
+ if not new_tokens:
1001
+ return 0
1002
+
1003
+ if not isinstance(new_tokens, (list, tuple)):
1004
+ new_tokens = [new_tokens]
1005
+
1006
+ return self._add_tokens(new_tokens, special_tokens=special_tokens)
1007
+
1008
+ @classmethod
1009
+ def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedToken]):
1010
+ if extra_sp_token not in cls.SPECIAL_TOKENS_ATTRIBUTES:
1011
+ cls.SPECIAL_TOKENS_ATTRIBUTES.append(extra_sp_token)
1012
+
1013
+ def _add_tokens(
1014
+ self,
1015
+ new_tokens: Union[List[str], List[AddedToken]],
1016
+ special_tokens: bool = False,
1017
+ ) -> int:
1018
+ raise NotImplementedError
1019
+
1020
+ @property
1021
+ def bos_token(self) -> str:
1022
+ """
1023
+ `str`: Beginning of sentence token. Log an error if used while not having been set.
1024
+ """
1025
+ if self._bos_token is None and self.verbose:
1026
+ logging.error("Using bos_token, but it is not set yet.")
1027
+ return None
1028
+ return str(self._bos_token)
1029
+
1030
+ @property
1031
+ def eos_token(self) -> str:
1032
+ """
1033
+ `str`: End of sentence token. Log an error if used while not having been set.
1034
+ """
1035
+ if self._eos_token is None and self.verbose:
1036
+ logging.error("Using eos_token, but it is not set yet.")
1037
+ return None
1038
+ return str(self._eos_token)
1039
+
1040
+ @property
1041
+ def unk_token(self) -> str:
1042
+ """
1043
+ `str`: Unknown token. Log an error if used while not having been set.
1044
+ """
1045
+ if self._unk_token is None and self.verbose:
1046
+ logging.error("Using unk_token, but it is not set yet.")
1047
+ return None
1048
+ return str(self._unk_token)
1049
+
1050
+ @property
1051
+ def sep_token(self) -> str:
1052
+ """
1053
+ `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
1054
+ having been set.
1055
+ """
1056
+ if self._sep_token is None and self.verbose:
1057
+ logging.error("Using sep_token, but it is not set yet.")
1058
+ return None
1059
+ return str(self._sep_token)
1060
+
1061
+ @property
1062
+ def pad_token(self) -> str:
1063
+ """
1064
+ `str`: Padding token. Log an error if used while not having been set.
1065
+ """
1066
+ if self._pad_token is None and self.verbose:
1067
+ logging.error("Using pad_token, but it is not set yet.")
1068
+ return None
1069
+ return str(self._pad_token)
1070
+
1071
+ @property
1072
+ def cls_token(self) -> str:
1073
+ """
1074
+ `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
1075
+ depth of the model. Log an error if used while not having been set.
1076
+ """
1077
+ if self._cls_token is None and self.verbose:
1078
+ logging.error("Using cls_token, but it is not set yet.")
1079
+ return None
1080
+ return str(self._cls_token)
1081
+
1082
+ @property
1083
+ def mask_token(self) -> str:
1084
+ """
1085
+ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
1086
+ having been set.
1087
+ """
1088
+ if self._mask_token is None and self.verbose:
1089
+ logging.error("Using mask_token, but it is not set yet.")
1090
+ return None
1091
+ return str(self._mask_token)
1092
+
1093
+ @property
1094
+ def additional_special_tokens(self) -> List[str]:
1095
+ """
1096
+ `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
1097
+ set.
1098
+ """
1099
+ if self._additional_special_tokens is None and self.verbose:
1100
+ logging.error("Using additional_special_tokens, but it is not set yet.")
1101
+ return None
1102
+ return [str(tok) for tok in self._additional_special_tokens]
1103
+
1104
+ @bos_token.setter
1105
+ def bos_token(self, value):
1106
+ self._bos_token = value
1107
+
1108
+ @eos_token.setter
1109
+ def eos_token(self, value):
1110
+ self._eos_token = value
1111
+
1112
+ @unk_token.setter
1113
+ def unk_token(self, value):
1114
+ self._unk_token = value
1115
+
1116
+ @sep_token.setter
1117
+ def sep_token(self, value):
1118
+ self._sep_token = value
1119
+
1120
+ @pad_token.setter
1121
+ def pad_token(self, value):
1122
+ self._pad_token = value
1123
+
1124
+ @cls_token.setter
1125
+ def cls_token(self, value):
1126
+ self._cls_token = value
1127
+
1128
+ @mask_token.setter
1129
+ def mask_token(self, value):
1130
+ self._mask_token = value
1131
+
1132
+ @additional_special_tokens.setter
1133
+ def additional_special_tokens(self, value):
1134
+ self._additional_special_tokens = value
1135
+
1136
+ @property
1137
+ def bos_token_id(self) -> Optional[int]:
1138
+ """
1139
+ `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not
1140
+ been set.
1141
+ """
1142
+ if self._bos_token is None:
1143
+ return None
1144
+ return self.convert_tokens_to_ids(self.bos_token)
1145
+
1146
+ @property
1147
+ def eos_token_id(self) -> Optional[int]:
1148
+ """
1149
+ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
1150
+ set.
1151
+ """
1152
+ if self._eos_token is None:
1153
+ return None
1154
+ return self.convert_tokens_to_ids(self.eos_token)
1155
+
1156
+ @property
1157
+ def unk_token_id(self) -> Optional[int]:
1158
+ """
1159
+ `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
1160
+ """
1161
+ if self._unk_token is None:
1162
+ return None
1163
+ return self.convert_tokens_to_ids(self.unk_token)
1164
+
1165
+ @property
1166
+ def sep_token_id(self) -> Optional[int]:
1167
+ """
1168
+ `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
1169
+ sequence. Returns `None` if the token has not been set.
1170
+ """
1171
+ if self._sep_token is None:
1172
+ return None
1173
+ return self.convert_tokens_to_ids(self.sep_token)
1174
+
1175
+ @property
1176
+ def pad_token_id(self) -> Optional[int]:
1177
+ """
1178
+ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
1179
+ """
1180
+ if self._pad_token is None:
1181
+ return None
1182
+ return self.convert_tokens_to_ids(self.pad_token)
1183
+
1184
+ @property
1185
+ def pad_token_type_id(self) -> int:
1186
+ """
1187
+ `int`: Id of the padding token type in the vocabulary.
1188
+ """
1189
+ return self._pad_token_type_id
1190
+
1191
+ @property
1192
+ def cls_token_id(self) -> Optional[int]:
1193
+ """
1194
+ `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
1195
+ leveraging self-attention along the full depth of the model.
1196
+
1197
+ Returns `None` if the token has not been set.
1198
+ """
1199
+ if self._cls_token is None:
1200
+ return None
1201
+ return self.convert_tokens_to_ids(self.cls_token)
1202
+
1203
+ @property
1204
+ def mask_token_id(self) -> Optional[int]:
1205
+ """
1206
+ `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
1207
+ modeling. Returns `None` if the token has not been set.
1208
+ """
1209
+ if self._mask_token is None:
1210
+ return None
1211
+ return self.convert_tokens_to_ids(self.mask_token)
1212
+
1213
+ @property
1214
+ def additional_special_tokens_ids(self) -> List[int]:
1215
+ """
1216
+ `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
1217
+ been set.
1218
+ """
1219
+ return self.convert_tokens_to_ids(self.additional_special_tokens)
1220
+
1221
+ @bos_token_id.setter
1222
+ def bos_token_id(self, value):
1223
+ self._bos_token = (
1224
+ self.convert_ids_to_tokens(value) if value is not None else None
1225
+ )
1226
+
1227
+ @eos_token_id.setter
1228
+ def eos_token_id(self, value):
1229
+ self._eos_token = (
1230
+ self.convert_ids_to_tokens(value) if value is not None else None
1231
+ )
1232
+
1233
+ @unk_token_id.setter
1234
+ def unk_token_id(self, value):
1235
+ self._unk_token = (
1236
+ self.convert_ids_to_tokens(value) if value is not None else None
1237
+ )
1238
+
1239
+ @sep_token_id.setter
1240
+ def sep_token_id(self, value):
1241
+ self._sep_token = (
1242
+ self.convert_ids_to_tokens(value) if value is not None else None
1243
+ )
1244
+
1245
+ @pad_token_id.setter
1246
+ def pad_token_id(self, value):
1247
+ self._pad_token = (
1248
+ self.convert_ids_to_tokens(value) if value is not None else None
1249
+ )
1250
+
1251
+ @cls_token_id.setter
1252
+ def cls_token_id(self, value):
1253
+ self._cls_token = (
1254
+ self.convert_ids_to_tokens(value) if value is not None else None
1255
+ )
1256
+
1257
+ @mask_token_id.setter
1258
+ def mask_token_id(self, value):
1259
+ self._mask_token = (
1260
+ self.convert_ids_to_tokens(value) if value is not None else None
1261
+ )
1262
+
1263
+ @additional_special_tokens_ids.setter
1264
+ def additional_special_tokens_ids(self, values):
1265
+ self._additional_special_tokens = [
1266
+ self.convert_ids_to_tokens(value) for value in values
1267
+ ]
1268
+
1269
+ @property
1270
+ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
1271
+ """
1272
+ `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
1273
+ `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).
1274
+
1275
+ Convert potential tokens of `AddedToken` type to string.
1276
+ """
1277
+ set_attr = {}
1278
+ for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1279
+ try:
1280
+ attr_value = getattr(self, "_" + attr)
1281
+ except:
1282
+ try:
1283
+ attr_value = getattr(self, attr)
1284
+ except:
1285
+ continue
1286
+ if attr_value:
1287
+ set_attr[attr] = (
1288
+ type(attr_value)(
1289
+ str(attr_value_sub) for attr_value_sub in attr_value
1290
+ )
1291
+ if isinstance(attr_value, (list, tuple))
1292
+ else str(attr_value)
1293
+ )
1294
+ return set_attr
1295
+
1296
+ @property
1297
+ def special_tokens_map_extended(
1298
+ self,
1299
+ ) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
1300
+ """
1301
+ `Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]`: A dictionary mapping
1302
+ special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).
1303
+
1304
+ Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
1305
+ special tokens are tokenized.
1306
+ """
1307
+ set_attr = {}
1308
+ for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1309
+ try:
1310
+ attr_value = getattr(self, "_" + attr)
1311
+ except:
1312
+ try:
1313
+ attr_value = getattr(self, attr)
1314
+ except:
1315
+ continue
1316
+ if attr_value:
1317
+ set_attr[attr] = attr_value
1318
+ return set_attr
1319
+
1320
+ @property
1321
+ def all_special_tokens(self) -> List[str]:
1322
+ """
1323
+ `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
1324
+
1325
+ Convert tokens of `AddedToken` type to string.
1326
+ """
1327
+ all_toks = [str(s) for s in self.all_special_tokens_extended]
1328
+ return all_toks
1329
+
1330
+ @property
1331
+ def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
1332
+ """
1333
+ `List[Union[str, AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
1334
+ attributes.
1335
+
1336
+ Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
1337
+ special tokens are tokenized.
1338
+ """
1339
+ all_tokens = []
1340
+ seen = set()
1341
+ for value in self.special_tokens_map_extended.values():
1342
+ if isinstance(value, (list, tuple)):
1343
+ tokens_to_add = [token for token in value if str(token) not in seen]
1344
+ else:
1345
+ tokens_to_add = [value] if str(value) not in seen else []
1346
+ seen.update(map(str, tokens_to_add))
1347
+ all_tokens.extend(tokens_to_add)
1348
+ return all_tokens
1349
+
1350
+ @property
1351
+ def all_special_ids(self) -> List[int]:
1352
+ """
1353
+ `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
1354
+ """
1355
+ all_toks = self.all_special_tokens
1356
+ all_ids = self.convert_tokens_to_ids(all_toks)
1357
+ return all_ids
1358
+
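+ # Illustrative sketch (hypothetical values): the mixin exposes paired string/id views of the
+ # registered special tokens, e.g.
+ #     tokenizer.pad_token, tokenizer.pad_token_id     # e.g. '[PAD]', 0
+ #     tokenizer.special_tokens_map                    # e.g. {'pad_token': '[PAD]', 'cls_token': '[CLS]', ...}
+ #     tokenizer.all_special_tokens                    # list of strings
+ #     tokenizer.all_special_ids                       # matching list of vocabulary ids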
1359
+
1360
+ class PretrainedTokenizerBase(SpecialTokensMixin):
1361
+ """
1362
+ Base class for [`PretrainedTokenizer`].
1363
+
1364
+ Class attributes (overridden by derived classes)
1365
+
1366
+ - **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
1367
+ vocabulary file required by the model, and as associated values, the filename for saving the associated file
1368
+ (string).
1369
+ - **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
1370
+ high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
1371
+ low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
1372
+ associated pretrained vocabulary file.
1373
+ - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
1374
+ of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
1375
+ or `None` if the model has no maximum input size.
1376
+ - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
1377
+ `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
1378
+ pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
1379
+ with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method.
1380
+ - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
1381
+ - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
1382
+ Should be `'right'` or `'left'`.
1383
+ - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
1384
+ applied. Should be `'right'` or `'left'`.
1385
+
1386
+ Args:
1387
+ model_max_length (`int`, *optional*):
1388
+ The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
1389
+ loaded with [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`], this will be set to the
1390
+ value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
1391
+ default to VERY_LARGE_INTEGER (`int(1e30)`).
1392
+ padding_side (`str`, *optional*):
1393
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1394
+ Default value is picked from the class attribute of the same name.
1395
+ truncation_side (`str`, *optional*):
1396
+ The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
1397
+ Default value is picked from the class attribute of the same name.
1398
+ model_input_names (`List[string]`, *optional*):
1399
+ The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
1400
+ `"attention_mask"`). Default value is picked from the class attribute of the same name.
1401
+ bos_token (`str` or `AddedToken`, *optional*):
1402
+ A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
1403
+ `self.bos_token_id`.
1404
+ eos_token (`str` or `AddedToken`, *optional*):
1405
+ A special token representing the end of a sentence. Will be associated to `self.eos_token` and
1406
+ `self.eos_token_id`.
1407
+ unk_token (`str` or `AddedToken`, *optional*):
1408
+ A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
1409
+ `self.unk_token_id`.
1410
+ sep_token (`str` or `AddedToken`, *optional*):
1411
+ A special token separating two different sentences in the same input (used by BERT for instance). Will be
1412
+ associated to `self.sep_token` and `self.sep_token_id`.
1413
+ pad_token (`str` or `AddedToken`, *optional*):
1414
+ A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
1415
+ attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`.
1416
+ cls_token (`str` or `AddedToken`, *optional*):
1417
+ A special token representing the class of the input (used by BERT for instance). Will be associated to
1418
+ `self.cls_token` and `self.cls_token_id`.
1419
+ mask_token (`str` or `AddedToken`, *optional*):
1420
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
1421
+ BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
1422
+ additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*):
1423
+ A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
1424
+ tokenization process. Will be associated to `self.additional_special_tokens` and
1425
+ `self.additional_special_tokens_ids`.
1426
+ """
1427
+
1428
+ resource_files_names: Dict[str, str] = {}
1429
+ pretrained_resource_files_map: Dict[str, Dict[str, str]] = {}
1430
+ pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
1431
+ max_model_input_sizes: Dict[str, Optional[int]] = {}
1432
+ _auto_class: Optional[str] = None
1433
+ tokenizer_config_file = TOKENIZER_CONFIG_NAME
1434
+
1435
+ # first name has to correspond to main model input name
1436
+ # to make sure `tokenizer.pad(...)` works correctly
1437
+ model_input_names: List[str] = ["input_ids", "token_type_ids"]
1438
+ padding_side: str = "right"
1439
+ truncation_side: str = "right"
1440
+ slow_tokenizer_class = None
1441
+
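+ # Illustrative sketch (hypothetical subclass, file names and URL): a concrete tokenizer typically
+ # overrides the class attributes documented above, e.g.
+ #     class MyTokenizer(PretrainedTokenizerBase):
+ #         resource_files_names = {"vocab_file": "vocab.txt"}
+ #         pretrained_resource_files_map = {"vocab_file": {"my-model-base": "https://example.com/vocab.txt"}}
+ #         pretrained_init_configuration = {"my-model-base": {"do_lower_case": True}}
+ #         max_model_input_sizes = {"my-model-base": 512}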
1442
+ def __init__(self, **kwargs):
1443
+ # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
1444
+ self.init_inputs = ()
1445
+
1446
+ self.init_kwargs = getattr(self, "init_kwargs", None) or copy.deepcopy(kwargs)
1447
+ self.name_or_path = kwargs.pop("name_or_path", "")
1448
+ self._processor_class = kwargs.pop("processor_class", None)
1449
+
1450
+ # For backward compatibility we fall back to setting model_max_length from max_len if provided
1451
+ model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
1452
+ self.model_max_length = (
1453
+ model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
1454
+ )
1455
+
1456
+ # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
1457
+ # is changed.
1458
+ self.padding_side = kwargs.pop("padding_side", self.padding_side)
1459
+ if self.padding_side not in ["right", "left"]:
1460
+ raise ValueError(
1461
+ f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
1462
+ )
1463
+
1464
+ self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
1465
+ if self.truncation_side not in ["right", "left"]:
1466
+ raise ValueError(
1467
+ f"Padding side should be selected between 'right' and 'left', current value: {self.truncation_side}"
1468
+ )
1469
+
1470
+ self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
1471
+
1472
+ self.clean_up_tokenization_spaces = kwargs.pop(
1473
+ "clean_up_tokenization_spaces", False
1474
+ )
1475
+
1476
+ self.split_special_tokens = kwargs.pop("split_special_tokens", False)
1477
+
1478
+ self.deprecation_warnings = (
1479
+ {}
1480
+ ) # Used to store when we have already issued a deprecation warning (avoid overlogging).
1481
+
1482
+ super().__init__(**kwargs)
1483
+
1484
+ @property
1485
+ def max_len_single_sentence(self) -> int:
1486
+ """
1487
+ `int`: The maximum length of a sentence that can be fed to the model.
1488
+ """
1489
+ return self.model_max_length - self.num_special_tokens_to_add(pair=False)
1490
+
1491
+ @property
1492
+ def max_len_sentences_pair(self) -> int:
1493
+ """
1494
+ `int`: The maximum combined length of a pair of sentences that can be fed to the model.
1495
+ """
1496
+ return self.model_max_length - self.num_special_tokens_to_add(pair=True)
1497
+
1498
+ @max_len_single_sentence.setter
1499
+ def max_len_single_sentence(self, value) -> int:
1500
+ # For backward compatibility, allow trying to set 'max_len_single_sentence'.
1501
+ if (
1502
+ value == self.model_max_length - self.num_special_tokens_to_add(pair=False)
1503
+ and self.verbose
1504
+ ):
1505
+ if not self.deprecation_warnings.get("max_len_single_sentence", False):
1506
+ warnings.warn(
1507
+ "Setting 'max_len_single_sentence' is now deprecated. "
1508
+ "This value is automatically set up."
1509
+ )
1510
+ self.deprecation_warnings["max_len_single_sentence"] = True
1511
+ else:
1512
+ raise ValueError(
1513
+ "Setting 'max_len_single_sentence' is now deprecated. "
1514
+ "This value is automatically set up."
1515
+ )
1516
+
1517
+ def _switch_to_input_mode(self):
1518
+ """
1519
+ Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
1520
+ """
1521
+ pass
1522
+
1523
+ @max_len_sentences_pair.setter
1524
+ def max_len_sentences_pair(self, value) -> int:
1525
+ if (
1526
+ value == self.model_max_length - self.num_special_tokens_to_add(pair=True)
1527
+ and self.verbose
1528
+ ):
1529
+ if not self.deprecation_warnings.get("max_len_sentences_pair", False):
1530
+ warnings.warn(
1531
+ "Setting 'max_len_sentences_pair' is now deprecated. "
1532
+ "This value is automatically set up."
1533
+ )
1534
+ self.deprecation_warnings["max_len_sentences_pair"] = True
1535
+ else:
1536
+ raise ValueError(
1537
+ "Setting 'max_len_sentences_pair' is now deprecated. "
1538
+ "This value is automatically set up."
1539
+ )
1540
+
1541
+ def _set_processor_class(self, processor_class: str):
1542
+ """Sets processor class as an attribute."""
1543
+ self._processor_class = processor_class
1544
+
1545
+ def __repr__(self) -> str:
1546
+ added_tokens_decoder_rep = "\n\t".join(
1547
+ [f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]
1548
+ )
1549
+ return (
1550
+ f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
1551
+ f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
1552
+ f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
1553
+ f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
1554
+ " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
1555
+ )
1556
+
1557
+ def get_vocab(self) -> Dict[str, int]:
1558
+ """
1559
+ Returns the vocabulary as a dictionary of token to index.
1560
+
1561
+ `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
1562
+ vocab.
1563
+
1564
+ Returns:
1565
+ `Dict[str, int]`: The vocabulary.
1566
+ """
1567
+ raise NotImplementedError()
1568
+
1569
+ @classmethod
1570
+ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
1571
+ """
1572
+ Creates an instance of `PretrainedTokenizer`. Related resources are loaded
1573
+ by specifying name of a built-in pretrained model, or a community-contributed
1574
+ pretrained model, or a local file directory path.
1575
+
1576
+ Args:
1577
+ pretrained_model_name_or_path (str): Name of pretrained model or dir path
1578
+ to load from. The string can be:
1579
+
1580
+ - Name of built-in pretrained model
1581
+ - Name of a community-contributed pretrained model.
1582
+ - Local directory path which contains tokenizer related resources
1583
+ and tokenizer config file ("tokenizer_config.json").
1584
+ from_hf_hub (bool, optional): whether to load from Huggingface Hub
1585
+ subfolder (str, optional): An optional value corresponding to a folder inside the repo.
1586
+ Only works when loading from Huggingface Hub.
1587
+ *args (tuple): positional arguments for tokenizer `__init__`. If provided,
1588
+ use these as position argument values for tokenizer initialization.
1589
+ **kwargs (dict): keyword arguments for tokenizer `__init__`. If provided,
1590
+ use these to update pre-defined keyword argument values for tokenizer
1591
+ initialization.
1592
+
1593
+ Returns:
1594
+ PretrainedTokenizer: An instance of `PretrainedTokenizer`.
1595
+
1596
+ Example:
1597
+ .. code-block::
1598
+
1599
+ from paddlenlp.transformers import BertTokenizer
1600
+
1601
+ # Name of built-in pretrained model
1602
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1603
+
1604
+ # Name of community-contributed pretrained model
1605
+ tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
1606
+
1607
+ # Load from local directory path
1608
+ tokenizer = BertTokenizer.from_pretrained('./my_bert/')
1609
+ """
1610
+ cache_dir = kwargs.pop("cache_dir", None)
1611
+ from_hf_hub = kwargs.pop("from_hf_hub", False)
1612
+ from_aistudio = kwargs.pop("from_aistudio", False)
1613
+ subfolder = kwargs.pop("subfolder", "")
1614
+ return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False)
1615
+
1616
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
1617
+ vocab_files = {}
1618
+ init_configuration = {}
1619
+
1620
+ additional_files_names = {
1621
+ "added_tokens_file": ADDED_TOKENS_FILE,
1622
+ "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
1623
+ "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
1624
+ "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME,
1625
+ }
1626
+
1627
+ if hasattr(cls, "vocab_files_names") and len(cls.resource_files_names) == 0:
1628
+ cls.resource_files_names = copy.deepcopy(cls.vocab_files_names)
1629
+ logging.error(
1630
+ "The attribute 'vocab_files_names' is deprecated. Please use 'resource_files_names' instead.",
1631
1632
+ )
1633
+ vocab_files_target = {**cls.resource_files_names, **additional_files_names}
1634
+ # From HF Hub or AI Studio
1635
+ if from_hf_hub or from_aistudio:
1636
+ # Only include the necessary resource files specified by the tokenizer cls
1637
+ # Deep copy to avoid modifying the class attributes
1638
+ vocab_files = copy.deepcopy(cls.resource_files_names)
1639
+ vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file
1640
+
1641
+ # From built-in pretrained models
1642
+ elif pretrained_model_name_or_path in cls.pretrained_init_configuration:
1643
+ for file_id, map_list in cls.pretrained_resource_files_map.items():
1644
+ vocab_files[file_id] = map_list[pretrained_model_name_or_path]
1645
+ init_configuration = copy.deepcopy(
1646
+ cls.pretrained_init_configuration[pretrained_model_name_or_path]
1647
+ )
1648
+ # From local dir path
1649
+ elif os.path.isdir(pretrained_model_name_or_path):
1650
+ vocab_files_target["tokenizer_config_file"] = cls.tokenizer_config_file
1651
+ for file_id, file_name in vocab_files_target.items():
1652
+ full_file_name = os.path.join(
1653
+ pretrained_model_name_or_path, subfolder, file_name
1654
+ )
1655
+ if os.path.isfile(full_file_name):
1656
+ vocab_files[file_id] = full_file_name
1657
+ else:
1658
+ # Assuming from community-contributed pretrained models
1659
+ for file_id, file_name in vocab_files_target.items():
1660
+ vocab_files[file_id] = file_name
1661
+ resolved_vocab_files = {}
1662
+ for file_id, file_path in vocab_files.items():
1663
+ # adapt to PaddleX
1664
+ resolved_vocab_files[file_id] = file_path
1665
+
1666
+ for file_id, file_path in resolved_vocab_files.items():
1667
+ if resolved_vocab_files[file_id] is not None:
1668
+ cache_dir = os.path.dirname(resolved_vocab_files[file_id])
1669
+ break
1670
+ return cls._from_pretrained(
1671
+ resolved_vocab_files,
1672
+ pretrained_model_name_or_path,
1673
+ init_configuration,
1674
+ *args,
1675
+ cache_dir=cache_dir,
1676
+ return_tokenizer_file_dir=return_tokenizer_file_dir,
1677
+ from_hf_hub=from_hf_hub,
1678
+ **kwargs,
1679
+ )
1680
+
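Beyond the docstring examples, the extra keyword arguments popped at the top of this method (`subfolder`, `return_tokenizer_file_dir`, etc.) can be combined; a minimal sketch, with an illustrative local path:

.. code-block::

    from paddlenlp.transformers import BertTokenizer

    # Resolve files from a local directory (optionally inside a subfolder) and
    # also return the directory the tokenizer files were loaded from.
    tokenizer, file_dir = BertTokenizer.from_pretrained(
        "./my_bert/", subfolder="", return_tokenizer_file_dir=True
    )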
1681
+ @classmethod
1682
+ def _from_pretrained(
1683
+ cls,
1684
+ resolved_vocab_files,
1685
+ pretrained_model_name_or_path,
1686
+ init_configuration,
1687
+ *init_inputs,
1688
+ cache_dir=None,
1689
+ return_tokenizer_file_dir=False,
1690
+ from_hf_hub=False,
1691
+ **kwargs,
1692
+ ):
1693
+ if cls.__name__.endswith("Fast"):
1694
+ from_slow = kwargs.get("from_slow", False)
1695
+ else:
1696
+ from_slow = kwargs.get("from_slow", True)
1697
+ has_tokenizer_file = (
1698
+ resolved_vocab_files.get("tokenizer_file", None) is not None
1699
+ )
1700
+ if (
1701
+ from_slow or not has_tokenizer_file
1702
+ ) and cls.slow_tokenizer_class is not None:
1703
+ slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
1704
+ copy.deepcopy(resolved_vocab_files),
1705
+ pretrained_model_name_or_path,
1706
+ copy.deepcopy(init_configuration),
1707
+ *init_inputs,
1708
+ cache_dir=cache_dir,
1709
+ **(copy.deepcopy(kwargs)),
1710
+ )
1711
+ else:
1712
+ slow_tokenizer = None
1713
+ tokenizer_config_file_dir_list = set()
1714
+ for k, v in resolved_vocab_files.items():
1715
+ if v is not None and os.path.isfile(v):
1716
+ tokenizer_config_file_dir_list.add(os.path.dirname(v))
1717
+ tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list)
1718
+ # TODO: check this
1719
+ assert (
1720
+ len(tokenizer_config_file_dir_list) > 0
1721
+ ), "All tokenizer files should be in the same directory."
1722
+
1723
+ has_tokenizer_file = (
1724
+ resolved_vocab_files.get("tokenizer_file", None) is not None
1725
+ )
1726
+ tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
1727
+ if tokenizer_config_file is not None:
1728
+ with io.open(tokenizer_config_file, encoding="utf-8") as f:
1729
+ init_kwargs = json.load(f)
1730
+ init_kwargs.pop("tokenizer_class", None)
1731
+ else:
1732
+ init_kwargs = init_configuration
1733
+
1734
+ if slow_tokenizer is not None:
1735
+ init_kwargs["__slow_tokenizer"] = slow_tokenizer
1736
+ init_kwargs["name_or_path"] = pretrained_model_name_or_path
1737
+ init_kwargs["from_slow"] = from_slow
1738
+
1739
+ pass_added_tokens_file = False
1740
+ added_tokens_decoder: Dict[int, AddedToken] = {}
1741
+ if "added_tokens_decoder" in init_kwargs:
1742
+ for idx, token in init_kwargs["added_tokens_decoder"].items():
1743
+ if isinstance(token, dict):
1744
+ token = AddedToken(**token)
1745
+ if isinstance(token, AddedToken):
1746
+ added_tokens_decoder[int(idx)] = token
1747
+ else:
1748
+ raise ValueError(
1749
+ f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
1750
+ )
1751
+ init_kwargs["added_tokens_decoder"] = (
1752
+ added_tokens_decoder  # NOTE: in tokenizer_config.json, the registered `added_tokens_decoder` is parsed into a plain dict
1753
+ )
1754
+ pass_added_tokens_file = True
1755
+
1756
+ init_kwargs.pop("init_class", None)
1757
+
1758
+ init_kwargs.update(kwargs)
1759
+
1760
+ def convert_added_tokens(obj):
1761
+ if (
1762
+ isinstance(obj, dict)
1763
+ and "__type" in obj
1764
+ and obj["__type"] == "AddedToken"
1765
+ ):
1766
+ obj.pop("__type")
1767
+ return AddedToken(**obj)
1768
+ elif isinstance(obj, (list, tuple)):
1769
+ return list(convert_added_tokens(o) for o in obj)
1770
+ elif isinstance(obj, dict):
1771
+ return {k: convert_added_tokens(v) for k, v in obj.items()}
1772
+ return obj
1773
+
1774
+ init_kwargs = convert_added_tokens(init_kwargs)
1775
+ if pretrained_model_name_or_path in cls.max_model_input_sizes:
1776
+ model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
1777
+ if model_max_length is not None and isinstance(
1778
+ model_max_length, (int, float)
1779
+ ):
1780
+ init_kwargs["model_max_length"] = min(
1781
+ init_kwargs.get("model_max_length", int(1e30)), model_max_length
1782
+ )
1783
+
1784
+ for args_name, file_path in resolved_vocab_files.items():
1785
+ if args_name not in init_kwargs or init_kwargs[args_name] is None:
1786
+ init_kwargs[args_name] = file_path
1787
+ elif not os.path.isfile(init_kwargs[args_name] or "") and os.path.isfile(
1788
+ file_path
1789
+ ):
1790
+ init_kwargs[args_name] = file_path
1791
+
1792
+ if from_hf_hub and "tokenizer_file" in init_kwargs:
1793
+ init_kwargs.pop("tokenizer_file")
1794
+
1795
+ try:
1796
+ tokenizer = cls(*init_inputs, **init_kwargs)
1797
+ # adapt to PaddleX
1798
+ except RuntimeError as e:
1799
+ if "sentencepiece_processor.cc" in str(e):
1800
+ logging.info(
1801
+ "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
1802
+ "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).",
1803
+ )
1804
+ return False
1805
+
1806
+ chat_template = init_kwargs.pop("chat_template", None)
1807
+ if chat_template is not None:
1808
+ tokenizer.init_chat_template(chat_template)
1809
+ special_tokens_map_file = resolved_vocab_files.pop(
1810
+ "special_tokens_map_file", None
1811
+ )
1812
+ if special_tokens_map_file is not None:
1813
+ with open(
1814
+ special_tokens_map_file, encoding="utf-8"
1815
+ ) as special_tokens_map_handle:
1816
+ special_tokens_map = json.load(special_tokens_map_handle)
1817
+ for key, value in special_tokens_map.items():
1818
+ if key in kwargs and kwargs[key]:
1819
+ continue
1820
+ if isinstance(value, dict):
1821
+ value = AddedToken(**value)
1822
+ elif isinstance(value, list):
1823
+ value = [
1824
+ AddedToken(**token) if isinstance(token, dict) else token
1825
+ for token in value
1826
+ ]
1827
+ setattr(tokenizer, key, value)
1828
+ cls._add_extra_special_tokens(key)
1829
+
1830
+ special_tokens = tokenizer.all_special_tokens
1831
+ added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
1832
+ added_tokens_file = None if pass_added_tokens_file else added_tokens_file
1833
+ if added_tokens_file is not None:
1834
+ with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
1835
+ added_tok_encoder = json.load(added_tokens_handle)
1836
+
1837
+ added_tok_encoder_sorted = list(
1838
+ sorted(added_tok_encoder.items(), key=lambda x: x[1])
1839
+ )
1840
+ for token, index in added_tok_encoder_sorted:
1841
+ if (
1842
+ has_tokenizer_file
1843
+ and index != len(tokenizer)
1844
+ and tokenizer.convert_tokens_to_ids(token) != index
1845
+ ):
1846
+ raise ValueError(
1847
+ f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
1848
+ f"{index}."
1849
+ )
1850
+ elif not has_tokenizer_file and index != len(tokenizer):
1851
+ raise ValueError(
1852
+ f"Non-consecutive added token '{token}' found. "
1853
+ f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
1854
+ )
1855
+
1856
+ tokenizer.add_tokens(
1857
+ token, special_tokens=bool(token in special_tokens)
1858
+ )
1859
+ added_tokens = tokenizer.sanitize_special_tokens()
1860
+ if added_tokens:
1861
+ logging.info(
1862
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
1863
+ )
1864
+ if pretrained_model_name_or_path in cls.pretrained_init_configuration:
1865
+ tokenizer.save_pretrained(cache_dir)
1866
+
1867
+ if return_tokenizer_file_dir:
1868
+ return tokenizer, list(tokenizer_config_file_dir_list)[0]
1869
+ return tokenizer
1870
+
1871
+ def save_pretrained(
1872
+ self, save_directory, filename_prefix: Optional[str] = None, **kwargs
1873
+ ):
1874
+ """
1875
+ Save tokenizer configuration and related resources to files under
1876
+ `save_directory`. The tokenizer configuration is saved into the file
1877
+ named by `tokenizer_config_file` (i.e. `tokenizer_config.json`),
1878
+ and resources are saved into the files named by `resource_files_names`
1879
+ via `self.save_resources(save_directory)`.
1880
+
1881
+ The `save_directory` can be used in `from_pretrained` as argument value
1882
+ of `pretrained_model_name_or_path` to re-load the tokenizer.
1883
+
1884
+ Args:
1885
+ save_directory (str): Directory to save files into.
1886
+ filename_prefix (str, optional):
1887
+ A prefix to add to the names of the files saved by the tokenizer.
1888
+
1889
+ Example:
1890
+ .. code-block::
1891
+
1892
+ from paddlenlp.transformers import BertTokenizer
1893
+
1894
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1895
+ tokenizer.save_pretrained('trained_model')
1896
+ # reload from save_directory
1897
+ tokenizer = BertTokenizer.from_pretrained('trained_model')
1898
+ """
1899
+ assert not os.path.isfile(
1900
+ save_directory
1901
+ ), "Saving directory ({}) should be a directory, not a file".format(
1902
+ save_directory
1903
+ )
1904
+ os.makedirs(save_directory, exist_ok=True)
1905
+
1906
+ special_tokens_map_file = os.path.join(
1907
+ save_directory,
1908
+ (filename_prefix + "-" if filename_prefix else "")
1909
+ + SPECIAL_TOKENS_MAP_FILE,
1910
+ )
1911
+ tokenizer_config_file = os.path.join(
1912
+ save_directory,
1913
+ (filename_prefix + "-" if filename_prefix else "")
1914
+ + self.tokenizer_config_file,
1915
+ )
1916
+
1917
+ tokenizer_config = copy.deepcopy(self.init_kwargs)
1918
+ if len(self.init_inputs) > 0:
1919
+ tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
1920
+ for file_id in self.resource_files_names.keys():
1921
+ tokenizer_config.pop(file_id, None)
1922
+
1923
+ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
1924
+ if isinstance(obj, AddedToken):
1925
+ out = obj.__getstate__()
1926
+ if add_type_field:
1927
+ out["__type"] = "AddedToken"
1928
+ return out
1929
+ elif isinstance(obj, (list, tuple)):
1930
+ return list(
1931
+ convert_added_tokens(o, add_type_field=add_type_field) for o in obj
1932
+ )
1933
+ elif isinstance(obj, dict):
1934
+ return {
1935
+ k: convert_added_tokens(v, add_type_field=add_type_field)
1936
+ for k, v in obj.items()
1937
+ }
1938
+ return obj
1939
+
1940
+ tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)
1941
+
1942
+ added_tokens = {}
1943
+ for key, value in self.added_tokens_decoder.items():
1944
+ if isinstance(value, AddedToken):
1945
+ added_tokens[key] = value.__getstate__()
1946
+ else:
1947
+ added_tokens[key] = AddedToken(value).__getstate__()
1948
+ tokenizer_config["added_tokens_decoder"] = added_tokens
1949
+
1950
+ tokenizer_class = self.__class__.__name__
1951
+ tokenizer_config["tokenizer_class"] = tokenizer_class
1952
+
1953
+ with io.open(tokenizer_config_file, "w", encoding="utf-8") as f:
1954
+ f.write(json.dumps(tokenizer_config, ensure_ascii=False))
1955
+ logging.info(f"tokenizer config file saved in {tokenizer_config_file}")
1956
+
1957
+ write_dict = convert_added_tokens(
1958
+ self.special_tokens_map_extended, add_type_field=False
1959
+ )
1960
+ with open(special_tokens_map_file, "w", encoding="utf-8") as f:
1961
+ f.write(json.dumps(write_dict, ensure_ascii=False))
1962
+ logging.info(f"Special tokens file saved in {special_tokens_map_file}")
1963
+
1964
+ file_names = (tokenizer_config_file, special_tokens_map_file)
1965
+
1966
+ save_files = self._save_pretrained(
1967
+ save_directory=save_directory,
1968
+ file_names=file_names,
1969
+ filename_prefix=filename_prefix,
1970
+ )
1971
+
1972
+ return save_files
1973
+
1974
+ def _save_pretrained(
1975
+ self,
1976
+ save_directory: Union[str, os.PathLike],
1977
+ file_names: Tuple[str],
1978
+ filename_prefix: Optional[str] = None,
1979
+ ) -> Tuple[str]:
1980
+ """
1981
+ Save a tokenizer using the tokenizer format: vocabulary + added tokens.
1982
+
1983
+ """
1984
+ save_directory = str(save_directory)
1985
+
1986
+ added_tokens_file = os.path.join(
1987
+ save_directory,
1988
+ (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE,
1989
+ )
1990
+ added_vocab = self.get_added_vocab()
1991
+ if added_vocab:
1992
+ with open(added_tokens_file, "w", encoding="utf-8") as f:
1993
+ out_str = json.dumps(added_vocab, ensure_ascii=False)
1994
+ f.write(out_str)
1995
+ logging.info(f"added tokens file saved in {added_tokens_file}")
1996
+
1997
+ self.save_resources(save_directory)
1998
+
1999
+ return file_names + (added_tokens_file,)
2000
+
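A short sketch of the `filename_prefix` behaviour implemented above (paths are illustrative): the tokenizer config, special-tokens map and added-tokens files receive the prefix, resources written by `save_resources()` do not, and `from_pretrained` looks files up by their unprefixed names, so prefixed saves are not meant to be reloaded directly.

.. code-block::

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Returns (tokenizer_config_file, special_tokens_map_file, added_tokens_file)
    saved = tokenizer.save_pretrained("./ckpt", filename_prefix="demo")

    # Without a prefix, the directory can be reloaded as-is
    tokenizer.save_pretrained("./ckpt_plain")
    tokenizer = BertTokenizer.from_pretrained("./ckpt_plain")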
2001
+ def tokenize(
2002
+ self,
2003
+ text: str,
2004
+ pair: Optional[str] = None,
2005
+ add_special_tokens: bool = False,
2006
+ **kwargs,
2007
+ ) -> List[str]:
2008
+ """
2009
+ Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.
2010
+
2011
+ Args:
2012
+ text (`str`):
2013
+ The sequence to be encoded.
2014
+ pair (`str`, *optional*):
2015
+ A second sequence to be encoded with the first.
2016
+ add_special_tokens (`bool`, *optional*, defaults to `False`):
2017
+ Whether or not to add the special tokens associated with the corresponding model.
2018
+ kwargs (additional keyword arguments, *optional*):
2019
+ Will be passed to the underlying model specific encode method. See details in
2020
+ [`~PretrainedTokenizerBase.__call__`]
2021
+
2022
+ Returns:
2023
+ `List[str]`: The list of tokens.
2024
+ """
2025
+ raise NotImplementedError
2026
+
2027
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
2028
+ raise NotImplementedError
2029
+
2030
+ def _get_padding_truncation_strategies(
2031
+ self,
2032
+ padding=False,
2033
+ truncation=False,
2034
+ max_length=None,
2035
+ pad_to_multiple_of=None,
2036
+ verbose=True,
2037
+ **kwargs,
2038
+ ):
2039
+ """
2040
+ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
2041
+ and pad_to_max_length) and behaviors.
2042
+ """
2043
+ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
2044
+ old_pad_to_max_length = kwargs.pop("pad_to_max_seq_len", False)
2045
+
2046
+ if max_length is not None and padding is False and truncation is False:
2047
+ if verbose:
2048
+ if not self.deprecation_warnings.get(
2049
+ "Truncation-not-explicitly-activated", False
2050
+ ):
2051
+ warnings.warn(
2052
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, "
2053
+ "please use `truncation=True` to explicitly truncate examples to max length. "
2054
+ "Defaulting to 'longest_first' truncation strategy. "
2055
+ "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
2056
+ "more precisely by providing a specific strategy to `truncation`."
2057
+ )
2058
+ self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
2059
+ truncation = "longest_first"
2060
+
2061
+ # Get padding strategy
2062
+ if padding is False and old_pad_to_max_length:
2063
+ if verbose:
2064
+ warnings.warn(
2065
+ "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
2066
+ "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
2067
+ "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
2068
+ "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
2069
+ "maximal input size of the model (e.g. 512 for Bert).",
2070
+ FutureWarning,
2071
+ )
2072
+ if max_length is None:
2073
+ padding_strategy = PaddingStrategy.LONGEST
2074
+ else:
2075
+ padding_strategy = PaddingStrategy.MAX_LENGTH
2076
+ elif padding is not False:
2077
+ if padding is True:
2078
+ if verbose:
2079
+ if max_length is not None and (
2080
+ truncation is False or truncation == "do_not_truncate"
2081
+ ):
2082
+ warnings.warn(
2083
+ "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
2084
+ "To pad to max length, use `padding='max_length'`."
2085
+ )
2086
+ if old_pad_to_max_length is not False:
2087
+ warnings.warn(
2088
+ "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`."
2089
+ )
2090
+ padding_strategy = PaddingStrategy.LONGEST
2091
+ elif not isinstance(padding, PaddingStrategy):
2092
+ padding_strategy = PaddingStrategy(padding)
2093
+ elif isinstance(padding, PaddingStrategy):
2094
+ padding_strategy = padding
2095
+ else:
2096
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
2097
+
2098
+ # Get truncation strategy
2099
+ if truncation is False and old_truncation_strategy != "do_not_truncate":
2100
+ if verbose:
2101
+ warnings.warn(
2102
+ "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
2103
+ "use `truncation=True` to truncate examples to a max length. You can give a specific "
2104
+ "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
2105
+ "maximal input size of the model (e.g. 512 for Bert). "
2106
+ " If you have pairs of inputs, you can give a specific truncation strategy selected among "
2107
+ "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
2108
+ "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
2109
+ "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
2110
+ FutureWarning,
2111
+ )
2112
+ truncation_strategy = TruncationStrategy(old_truncation_strategy)
2113
+ elif truncation is not False and truncation is not None:
2114
+ if truncation is True:
2115
+ truncation_strategy = (
2116
+ TruncationStrategy.LONGEST_FIRST
2117
+ ) # Default to truncate the longest sequences in pairs of inputs
2118
+ elif not isinstance(truncation, TruncationStrategy):
2119
+ truncation_strategy = TruncationStrategy(truncation)
2120
+ elif isinstance(truncation, TruncationStrategy):
2121
+ truncation_strategy = truncation
2122
+ else:
2123
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
2124
+
2125
+ # Set max length if needed
2126
+ if max_length is None:
2127
+ if padding_strategy == PaddingStrategy.MAX_LENGTH:
2128
+ if self.model_max_length > LARGE_INTEGER:
2129
+ if verbose:
2130
+ if not self.deprecation_warnings.get(
2131
+ "Asking-to-pad-to-max_length", False
2132
+ ):
2133
+ warnings.warn(
2134
+ "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
2135
+ "Default to no padding."
2136
+ )
2137
+ self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
2138
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
2139
+ else:
2140
+ max_length = self.model_max_length
2141
+
2142
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
2143
+ if self.model_max_length > LARGE_INTEGER:
2144
+ if verbose:
2145
+ if not self.deprecation_warnings.get(
2146
+ "Asking-to-truncate-to-max_length", False
2147
+ ):
2148
+ warnings.warn(
2149
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
2150
+ "Default to no truncation."
2151
+ )
2152
+ self.deprecation_warnings[
2153
+ "Asking-to-truncate-to-max_length"
2154
+ ] = True
2155
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
2156
+ else:
2157
+ max_length = self.model_max_length
2158
+
2159
+ # Test if we have a padding token
2160
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD and (
2161
+ not self.pad_token or self.pad_token_id < 0
2162
+ ):
2163
+ raise ValueError(
2164
+ "Asking to pad but the tokenizer does not have a padding token. "
2165
+ "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
2166
+ "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
2167
+ )
2168
+
2169
+ # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
2170
+ if (
2171
+ truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
2172
+ and padding_strategy != PaddingStrategy.DO_NOT_PAD
2173
+ and pad_to_multiple_of is not None
2174
+ and max_length is not None
2175
+ and (max_length % pad_to_multiple_of != 0)
2176
+ ):
2177
+ raise ValueError(
2178
+ f"Truncation and padding are both activated but "
2179
+ f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
2180
+ )
2181
+
2182
+ return padding_strategy, truncation_strategy, max_length, kwargs
2183
+
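For reference, a sketch of how the argument combinations above resolve; `tok` stands for any concrete tokenizer instance, and the internal helper is called directly here only for illustration:

.. code-block::

    # padding=True -> LONGEST, padding="max_length" -> MAX_LENGTH,
    # truncation=True -> LONGEST_FIRST; max_length falls back to
    # tok.model_max_length when a strategy needs it and none is given.
    pad_strat, trunc_strat, max_len, _ = tok._get_padding_truncation_strategies(
        padding="max_length", truncation=True, max_length=128
    )
    # pad_strat == PaddingStrategy.MAX_LENGTH
    # trunc_strat == TruncationStrategy.LONGEST_FIRST
    # max_len == 128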
2184
+ def __call__(
2185
+ self,
2186
+ text: Union[str, List[str], List[List[str]]],
2187
+ text_pair: Optional[Union[str, List[str], List[List[str]]]] = None,
2188
+ max_length: Optional[int] = None,
2189
+ stride: int = 0,
2190
+ is_split_into_words: Union[bool, str] = False,
2191
+ padding: Union[bool, str, PaddingStrategy] = False,
2192
+ truncation: Union[bool, str, TruncationStrategy] = False,
2193
+ return_position_ids: bool = None,
2194
+ return_token_type_ids: Optional[bool] = None,
2195
+ return_attention_mask: Optional[bool] = None,
2196
+ return_length: bool = False,
2197
+ return_overflowing_tokens: bool = False,
2198
+ return_special_tokens_mask: bool = False,
2199
+ return_dict: bool = True,
2200
+ return_offsets_mapping: bool = False,
2201
+ add_special_tokens: bool = True,
2202
+ pad_to_multiple_of: Optional[int] = None,
2203
+ padding_side: Optional[Literal["right", "left"]] = None,
2204
+ return_tensors: Optional[Union[str, TensorType]] = None,
2205
+ verbose: bool = True,
2206
+ **kwargs,
2207
+ ):
2208
+ """
2209
+ Performs tokenization and uses the tokenized tokens to prepare model
2210
+ inputs. It supports sequence or sequence pair as input, and batch input
2211
+ is allowed. `self.encode()` or `self.batch_encode()` would be called
2212
+ separately for single or batch input depending on input format and
2213
+ `is_split_into_words` argument.
2214
+
2215
+ Args:
2216
+ text (str, List[str] or List[List[str]]):
2217
+ The sequence or batch of sequences to be processed. One sequence
2218
+ is a string or a list of strings depending on whether it has been
2219
+ pretokenized. If each sequence is provided as a list of strings
2220
+ (pretokenized), you must set `is_split_into_words` as `True` to
2221
+ disambiguate with a batch of sequences.
2222
+ text_pair (str, List[str] or List[List[str]], optional):
2223
+ Same as the `text` argument, but representing the second
2224
+ sequence of the sequence pair.
2225
+ max_length (int, optional):
2226
+ If set to a number, will limit the total sequence returned so
2227
+ that it has a maximum length. If there are overflowing tokens,
2228
+ those overflowing tokens will be added to the returned dictionary
2229
+ when `return_overflowing_tokens` is `True`. Defaults to `None`.
2230
+ stride (int, optional):
2231
+ Only available for batch input of sequence pair and mainly for
2232
+ question answering usage. For QA, `text` represents questions
2233
+ and `text_pair` represents contexts. If `stride` is set to a
2234
+ positive number, the context will be split into multiple spans
2235
+ where `stride` defines the number of (tokenized) tokens to skip
2236
+ from the start of one span to get the next span, thus will produce
2237
+ a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
2238
+ and 'offset_mapping' preserving the original example and position
2239
+ information will be added to the returned dictionary. Defaults to 0.
2240
+ is_split_into_words (Union[bool, str], optional):
2241
+ When the text consists of words or tokens, `is_split_into_words` should be `True` or `'token'`.
2242
+ `True`: the text is a sequence of words that still needs to be tokenized.
2243
+ `'token'`: the text is a sequence of tokens that has already been tokenized and should not be tokenized again.
2244
+ padding (bool, str or [PaddingStrategy], optional):
2245
+ Activates and controls padding. Accepts the following values:
2246
+
2247
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
2248
+ sequence is provided).
2249
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
2250
+ acceptable input length for the model if that argument is not provided.
2251
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
2252
+ lengths).
2253
+ Defaults to `False`.
2254
+ truncation (bool, str or [TruncationStrategy], optional):
2255
+ Activates and controls truncation. Accepts the following values:
2256
+
2257
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
2258
+ to the maximum acceptable input length for the model if that argument is not provided. This will
2259
+ truncate token by token, removing a token from the longest sequence in the pair if a pair of
2260
+ sequences (or a batch of pairs) is provided.
2261
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
2262
+ maximum acceptable input length for the model if that argument is not provided. This will only
2263
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2264
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
2265
+ maximum acceptable input length for the model if that argument is not provided. This will only
2266
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2267
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
2268
+ greater than the model maximum admissible input size).
2269
+ Defaults to `False`.
2270
+ return_position_ids (bool, optional):
2271
+ Whether to include tokens position ids in the returned dictionary.
2272
+ Defaults to `False`.
2273
+ return_token_type_ids (bool, optional):
2274
+ Whether to include token type ids in the returned dictionary.
2275
+ Defaults to `True`.
2276
+ return_attention_mask (bool, optional):
2277
+ Whether to include the attention mask in the returned dictionary.
2278
+ Defaults to `False`.
2279
+ return_length (bool, optional):
2280
+ Whether to include the length of each encoded inputs in the
2281
+ returned dictionary. Defaults to `False`.
2282
+ return_overflowing_tokens (bool, optional):
2283
+ Whether to include overflowing token information in the returned
2284
+ dictionary. Defaults to `False`.
2285
+ return_special_tokens_mask (bool, optional):
2286
+ Whether to include special tokens mask information in the returned
2287
+ dictionary. Defaults to `False`.
2288
+ return_dict (bool, optional):
2289
+ Decide the format for returned encoded batch inputs. Only works when
2290
+ input is a batch of data.
2291
+ ::
2292
+ - If True, encoded inputs would be a dictionary like:
2293
+ {'input_ids': [[1, 4444, 4385, 1545, 6712],[1, 4444, 4385]],
2294
+ 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0]]}
2295
+ - If False, encoded inputs would be a list like:
2296
+ [{'input_ids': [1, 4444, 4385, 1545, 6712],
2297
+ 'token_type_ids': [0, 0, 0, 0, 0]},
2298
+ {'input_ids': [1, 4444, 4385], 'token_type_ids': [0, 0, 0]}]
2299
+
2300
+ Defaults to `True`.
2301
+ return_offsets_mapping (bool, optional):
2302
+ Whether to include, for each token, the pair of start and end
2303
+ char indices in the original input in the returned
2304
+ dictionary. Would be automatically set to `True` when `stride` > 0.
2305
+ Defaults to `False`.
2306
+ add_special_tokens (bool, optional):
2307
+ Whether to add the special tokens associated with the corresponding model
2308
+ to the encoded inputs. Defaults to `True`
2309
+ pad_to_multiple_of (int, optional):
2310
+ If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
2311
+ the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
2312
+ Defaults to `None`.
2313
+ padding_side (`str`, *optional*):
2314
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
2315
+ Default value is picked from the class attribute of the same name.
2316
+ return_tensors (str or [TensorType], optional):
2317
+ If set, will return tensors instead of list of python integers. Acceptable values are:
2318
+
2319
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
2320
+ - `'np'`: Return Numpy `np.ndarray` objects.
2321
+ Defaults to `None`.
2322
+ verbose (bool, optional):
2323
+ Whether or not to print more information and warnings. Defaults to True.
2324
+
2325
+ Returns:
2326
+ dict or list[dict] (for batch input):
2327
+ The dict has the following optional items:
2328
+
2329
+ - **input_ids** (list[int] or list[list[int]]): List of token ids to be fed to a model.
2330
+ - **position_ids** (list[int] or list[list[int]], optional): List of token position ids to be
2331
+ fed to a model. Included when `return_position_ids` is `True`
2332
+ - **token_type_ids** (list[int] or list[list[int]], optional): List of token type ids to be
2333
+ fed to a model. Included when `return_token_type_ids` is `True`.
2334
+ - **attention_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
2335
+ where 0 specifies paddings and should not be attended to by the
2336
+ model. Included when `return_attention_mask` is `True`.
2337
+ - **seq_len** (int or list[int], optional): The input_ids length. Included when `return_length`
2338
+ is `True`.
2339
+ - **overflowing_tokens** (list[int] or list[list[int]], optional): List of overflowing tokens.
2340
+ Included if `max_length` is specified and `return_overflowing_tokens`
2341
+ is True.
2342
+ - **num_truncated_tokens** (int or list[int], optional): The number of overflowing tokens.
2343
+ Included if `max_length` is specified and `return_overflowing_tokens`
2344
+ is True.
2345
+ - **special_tokens_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
2346
+ with 0 specifying special added tokens and 1 specifying sequence tokens.
2347
+ Included when `return_special_tokens_mask` is `True`.
2348
+ - **offset_mapping** (list[tuple], optional): list of pairs giving the
2349
+ start and end char indices in the original input for each token.
2350
+ For a special token, the index pair is `(0, 0)`. Included when
2351
+ `return_overflowing_tokens` is True or `stride` > 0.
2352
+ - **overflow_to_sample** (int or list[int], optional): Index of example from which this
2353
+ feature is generated. Included when `stride` works.
2354
+ """
2355
+
2356
+ # Input type checking for clearer error
2357
+ def _is_valid_text_input(t):
2358
+ if isinstance(t, str):
2359
+ # Strings are fine
2360
+ return True
2361
+ elif isinstance(t, (list, tuple)):
2362
+ # List are fine as long as they are...
2363
+ if len(t) == 0:
2364
+ # ... empty
2365
+ return True
2366
+ elif isinstance(t[0], str):
2367
+ # ... list of strings
2368
+ return True
2369
+ elif isinstance(t[0], (list, tuple)):
2370
+ # ... list with an empty list or with a list of strings
2371
+ return len(t[0]) == 0 or isinstance(t[0][0], str)
2372
+ else:
2373
+ return False
2374
+ else:
2375
+ return False
2376
+
2377
+ if not _is_valid_text_input(text):
2378
+ raise ValueError(
2379
+ "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2380
+ "or `List[List[str]]` (batch of pretokenized examples)."
2381
+ )
2382
+
2383
+ if text_pair is not None and not _is_valid_text_input(text_pair):
2384
+ raise ValueError(
2385
+ "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2386
+ "or `List[List[str]]` (batch of pretokenized examples)."
2387
+ )
2388
+
2389
+ # check `split_into_words` value
2390
+ if isinstance(is_split_into_words, str) and is_split_into_words != "token":
2391
+ raise ValueError(
2392
+ "the value of `is_split_into_words` should be one of: {True, False, 'token'} but receive: <%s>",
2393
+ is_split_into_words,
2394
+ )
2395
+
2396
+ if is_split_into_words:
2397
+ is_batched = (
2398
+ isinstance(text, (list, tuple))
2399
+ and text
2400
+ and isinstance(text[0], (list, tuple))
2401
+ )
2402
+ else:
2403
+ is_batched = isinstance(text, (list, tuple))
2404
+
2405
+ if is_batched:
2406
+ if isinstance(text_pair, str):
2407
+ raise TypeError(
2408
+ "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
2409
+ )
2410
+ if text_pair is not None and len(text) != len(text_pair):
2411
+ raise ValueError(
2412
+ f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
2413
+ )
2414
+ batch_text_or_text_pairs = (
2415
+ list(zip(text, text_pair)) if text_pair is not None else text
2416
+ )
2417
+ return self.batch_encode(
2418
+ batch_text_or_text_pairs=batch_text_or_text_pairs,
2419
+ max_length=max_length,
2420
+ stride=stride,
2421
+ is_split_into_words=is_split_into_words,
2422
+ padding=padding,
2423
+ truncation=truncation,
2424
+ return_position_ids=return_position_ids,
2425
+ return_token_type_ids=return_token_type_ids,
2426
+ return_attention_mask=return_attention_mask,
2427
+ return_length=return_length,
2428
+ return_overflowing_tokens=return_overflowing_tokens,
2429
+ return_special_tokens_mask=return_special_tokens_mask,
2430
+ return_dict=return_dict,
2431
+ return_offsets_mapping=return_offsets_mapping,
2432
+ add_special_tokens=add_special_tokens,
2433
+ pad_to_multiple_of=pad_to_multiple_of,
2434
+ padding_side=padding_side,
2435
+ return_tensors=return_tensors,
2436
+ verbose=verbose,
2437
+ **kwargs,
2438
+ )
2439
+ else:
2440
+ return self.encode(
2441
+ text=text,
2442
+ text_pair=text_pair,
2443
+ max_length=max_length,
2444
+ stride=stride,
2445
+ is_split_into_words=is_split_into_words,
2446
+ padding=padding,
2447
+ truncation=truncation,
2448
+ return_position_ids=return_position_ids,
2449
+ return_token_type_ids=return_token_type_ids,
2450
+ return_attention_mask=return_attention_mask,
2451
+ return_length=return_length,
2452
+ return_overflowing_tokens=return_overflowing_tokens,
2453
+ return_special_tokens_mask=return_special_tokens_mask,
2454
+ return_offsets_mapping=return_offsets_mapping,
2455
+ add_special_tokens=add_special_tokens,
2456
+ pad_to_multiple_of=pad_to_multiple_of,
2457
+ padding_side=padding_side,
2458
+ return_tensors=return_tensors,
2459
+ verbose=verbose,
2460
+ **kwargs,
2461
+ )
2462
+
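A minimal sketch of the dispatch described in the docstring above, assuming a concrete subclass such as `BertTokenizer` (the input strings are illustrative):

.. code-block::

    tok = BertTokenizer.from_pretrained("bert-base-uncased")

    # Single example -> handled by self.encode()
    single = tok("He was a puppeteer", max_length=16, truncation=True)

    # Batch of sequence pairs -> handled by self.batch_encode(),
    # padded to the longest item in the batch
    batch = tok(
        ["What is PaddleX?", "Where is it documented?"],
        text_pair=["PaddleX is a toolkit.", "In the docs."],
        padding=True,
        return_attention_mask=True,
    )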
2463
+ def encode(
2464
+ self,
2465
+ text,
2466
+ text_pair=None,
2467
+ add_special_tokens=True,
2468
+ padding: Union[bool, str, PaddingStrategy] = False,
2469
+ truncation: Union[bool, str, TruncationStrategy] = False,
2470
+ max_length: Optional[int] = None,
2471
+ stride: int = 0,
2472
+ is_split_into_words: bool = False,
2473
+ pad_to_multiple_of: Optional[int] = None,
2474
+ padding_side: Optional[Literal["right", "left"]] = None,
2475
+ return_tensors: Optional[Union[str, TensorType]] = None,
2476
+ return_token_type_ids: Optional[bool] = None,
2477
+ return_attention_mask: Optional[bool] = None,
2478
+ return_overflowing_tokens: bool = False,
2479
+ return_special_tokens_mask: bool = False,
2480
+ return_offsets_mapping: bool = False,
2481
+ return_length: bool = False,
2482
+ verbose: bool = True,
2483
+ return_position_ids=None,
2484
+ **kwargs,
2485
+ ) -> BatchEncoding:
2486
+ """
2487
+ Tokenize and prepare for the model a sequence or a pair of sequences.
2488
+
2489
+ Args:
2490
+ text (`str`, `List[str]` or `List[int]`):
2491
+ The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
2492
+ `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2493
+ method).
2494
+ text_pair (`str`, `List[str]` or `List[int]`, *optional*):
2495
+ Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
2496
+ the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2497
+ method).
2498
+ """
2499
+ # Backward compatibility for 'max_seq_len'
2500
+ old_max_seq_len = kwargs.get("max_seq_len", None)
2501
+ if max_length is None and old_max_seq_len:
2502
+ if verbose:
2503
+ warnings.warn(
2504
+ "The `max_seq_len` argument is deprecated and will be removed in a future version, "
2505
+ "please use `max_length` instead.",
2506
+ FutureWarning,
2507
+ )
2508
+ max_length = old_max_seq_len
2509
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2510
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2511
+ self._get_padding_truncation_strategies(
2512
+ padding=padding,
2513
+ truncation=truncation,
2514
+ max_length=max_length,
2515
+ pad_to_multiple_of=pad_to_multiple_of,
2516
+ verbose=verbose,
2517
+ **kwargs,
2518
+ )
2519
+ )
2520
+
2521
+ return self._encode_plus(
2522
+ text=text,
2523
+ text_pair=text_pair,
2524
+ add_special_tokens=add_special_tokens,
2525
+ padding_strategy=padding_strategy,
2526
+ truncation_strategy=truncation_strategy,
2527
+ max_length=max_length,
2528
+ stride=stride,
2529
+ is_split_into_words=is_split_into_words,
2530
+ pad_to_multiple_of=pad_to_multiple_of,
2531
+ padding_side=padding_side,
2532
+ return_tensors=return_tensors,
2533
+ return_position_ids=return_position_ids,
2534
+ return_token_type_ids=return_token_type_ids,
2535
+ return_attention_mask=return_attention_mask,
2536
+ return_overflowing_tokens=return_overflowing_tokens,
2537
+ return_special_tokens_mask=return_special_tokens_mask,
2538
+ return_offsets_mapping=return_offsets_mapping,
2539
+ return_length=return_length,
2540
+ verbose=verbose,
2541
+ **kwargs,
2542
+ )
2543
+
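The backward-compatibility branch above keeps older call sites that still pass `max_seq_len` working; a sketch, reusing the `tok` instance from the earlier example (the first call emits a FutureWarning and maps `max_seq_len` to `max_length`):

.. code-block::

    enc_old = tok.encode("some text", max_seq_len=32, truncation=True)
    enc_new = tok.encode("some text", max_length=32, truncation=True)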
2544
+ def encode_plus(
2545
+ self,
2546
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
2547
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2548
+ add_special_tokens: bool = True,
2549
+ padding: Union[bool, str, PaddingStrategy] = False,
2550
+ truncation: Union[bool, str, TruncationStrategy] = None,
2551
+ max_length: Optional[int] = None,
2552
+ stride: int = 0,
2553
+ is_split_into_words: bool = False,
2554
+ padding_side: Optional[Literal["right", "left"]] = None,
2555
+ pad_to_multiple_of: Optional[int] = None,
2556
+ return_tensors: Optional[Union[str, TensorType]] = None,
2557
+ return_token_type_ids: Optional[bool] = None,
2558
+ return_attention_mask: Optional[bool] = None,
2559
+ return_overflowing_tokens: bool = False,
2560
+ return_special_tokens_mask: bool = False,
2561
+ return_offsets_mapping: bool = False,
2562
+ return_length: bool = False,
2563
+ verbose: bool = True,
2564
+ **kwargs,
2565
+ ) -> BatchEncoding:
2566
+ """
2567
+ Tokenize and prepare for the model a sequence or a pair of sequences.
2568
+
2569
+ <Tip warning={true}>
2570
+
2571
+ This method is deprecated, `__call__` should be used instead.
2572
+
2573
+ </Tip>
2574
+
2575
+ Args:
2576
+ text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
2577
+ The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
2578
+ `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2579
+ method).
2580
+ text_pair (`str`, `List[str]` or `List[int]`, *optional*):
2581
+ Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
2582
+ the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2583
+ method).
2584
+ """
2585
+
2586
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2587
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2588
+ self._get_padding_truncation_strategies(
2589
+ padding=padding,
2590
+ truncation=truncation,
2591
+ max_length=max_length,
2592
+ pad_to_multiple_of=pad_to_multiple_of,
2593
+ verbose=verbose,
2594
+ **kwargs,
2595
+ )
2596
+ )
2597
+
2598
+ return self._encode_plus(
2599
+ text=text,
2600
+ text_pair=text_pair,
2601
+ add_special_tokens=add_special_tokens,
2602
+ padding_strategy=padding_strategy,
2603
+ truncation_strategy=truncation_strategy,
2604
+ max_length=max_length,
2605
+ stride=stride,
2606
+ is_split_into_words=is_split_into_words,
2607
+ pad_to_multiple_of=pad_to_multiple_of,
2608
+ padding_side=padding_side,
2609
+ return_tensors=return_tensors,
2610
+ return_token_type_ids=return_token_type_ids,
2611
+ return_attention_mask=return_attention_mask,
2612
+ return_overflowing_tokens=return_overflowing_tokens,
2613
+ return_special_tokens_mask=return_special_tokens_mask,
2614
+ return_offsets_mapping=return_offsets_mapping,
2615
+ return_length=return_length,
2616
+ verbose=verbose,
2617
+ **kwargs,
2618
+ )
2619
+
2620
+ def _encode_plus(
2621
+ self,
2622
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
2623
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2624
+ add_special_tokens: bool = True,
2625
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2626
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2627
+ max_length: Optional[int] = None,
2628
+ stride: int = 0,
2629
+ is_split_into_words: bool = False,
2630
+ pad_to_multiple_of: Optional[int] = None,
2631
+ padding_side: Optional[Literal["right", "left"]] = None,
2632
+ return_position_ids: Optional[bool] = None,
2633
+ return_tensors: Optional[Union[str, TensorType]] = None,
2634
+ return_token_type_ids: Optional[bool] = None,
2635
+ return_attention_mask: Optional[bool] = None,
2636
+ return_overflowing_tokens: bool = False,
2637
+ return_special_tokens_mask: bool = False,
2638
+ return_offsets_mapping: bool = False,
2639
+ return_length: bool = False,
2640
+ verbose: bool = True,
2641
+ **kwargs,
2642
+ ) -> BatchEncoding:
2643
+ raise NotImplementedError
2644
+
2645
+ def batch_encode(
2646
+ self,
2647
+ batch_text_or_text_pairs: Union[
2648
+ List[TextInput],
2649
+ List[TextInputPair],
2650
+ List[PreTokenizedInput],
2651
+ List[PreTokenizedInputPair],
2652
+ List[EncodedInput],
2653
+ List[EncodedInputPair],
2654
+ ],
2655
+ max_length=None,
2656
+ stride: int = 0,
2657
+ is_split_into_words: bool = False,
2658
+ padding: Union[bool, str, PaddingStrategy] = False,
2659
+ truncation: Union[bool, str, TruncationStrategy] = False,
2660
+ return_position_ids=None,
2661
+ # TODO(wj-mcat): keep align with `encode` method
2662
+ return_token_type_ids=None,
2663
+ return_attention_mask=None,
2664
+ return_length=False,
2665
+ return_overflowing_tokens=False,
2666
+ return_special_tokens_mask=False,
2667
+ return_dict=True,
2668
+ return_offsets_mapping=False,
2669
+ add_special_tokens=True,
2670
+ pad_to_multiple_of: Optional[int] = None,
2671
+ padding_side: Optional[Literal["right", "left"]] = None,
2672
+ return_tensors: Optional[Union[str, TensorType]] = None,
2673
+ verbose: bool = True,
2674
+ **kwargs,
2675
+ ) -> BatchEncoding:
2676
+ """
2677
+ Performs tokenization and uses the tokenized tokens to prepare model
2678
+ inputs. It supports batch inputs of sequence or sequence pair.
2679
+
2680
+ Args:
2681
+ batch_text_or_text_pairs (list):
2682
+ Each element of the list can be a sequence or a sequence pair, and the
2683
+ sequence is a string or a list of strings depending on whether
2684
+ it has been pretokenized. If each sequence is provided as a list
2685
+ of strings (pretokenized), you must set `is_split_into_words` as
2686
+ `True` to disambiguate with a sequence pair.
2687
+
2688
+ Returns:
2689
+ dict or list[dict]:
2690
+ The dict has the same optional items as described in `__call__`.
2691
+
2692
+ """
2693
+ # Backward compatibility for 'max_seq_len'
2694
+ old_max_seq_len = kwargs.get("max_seq_len", None)
2695
+ if max_length is None and old_max_seq_len:
2696
+ if verbose:
2697
+ warnings.warn(
2698
+ "The `max_seq_len` argument is deprecated and will be removed in a future version, "
2699
+ "please use `max_length` instead.",
2700
+ FutureWarning,
2701
+ )
2702
+ max_length = old_max_seq_len
2703
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2704
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2705
+ self._get_padding_truncation_strategies(
2706
+ padding=padding,
2707
+ truncation=truncation,
2708
+ max_length=max_length,
2709
+ pad_to_multiple_of=pad_to_multiple_of,
2710
+ verbose=verbose,
2711
+ **kwargs,
2712
+ )
2713
+ )
2714
+
2715
+ return self._batch_encode_plus(
2716
+ batch_text_or_text_pairs=batch_text_or_text_pairs,
2717
+ add_special_tokens=add_special_tokens,
2718
+ padding_strategy=padding_strategy,
2719
+ truncation_strategy=truncation_strategy,
2720
+ max_length=max_length,
2721
+ stride=stride,
2722
+ is_split_into_words=is_split_into_words,
2723
+ pad_to_multiple_of=pad_to_multiple_of,
2724
+ padding_side=padding_side,
2725
+ return_tensors=return_tensors,
2726
+ return_position_ids=return_position_ids,
2727
+ return_token_type_ids=return_token_type_ids,
2728
+ return_attention_mask=return_attention_mask,
2729
+ return_overflowing_tokens=return_overflowing_tokens,
2730
+ return_special_tokens_mask=return_special_tokens_mask,
2731
+ return_dict=return_dict,
2732
+ return_offsets_mapping=return_offsets_mapping,
2733
+ return_length=return_length,
2734
+ verbose=verbose,
2735
+ **kwargs,
2736
+ )
2737
+
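A sketch of calling `batch_encode` directly with pre-built (text, text_pair) tuples, mirroring what `__call__` builds for batch input (assuming `tok` is a concrete tokenizer instance):

.. code-block::

    pairs = [("question one", "context one"), ("question two", "context two")]
    out = tok.batch_encode(
        pairs,
        max_length=64,
        truncation=True,
        padding="max_length",
        return_dict=True,  # dict of lists; False would give a list of dicts
    )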
2738
+ def _batch_encode_plus(
2739
+ self,
2740
+ batch_text_or_text_pairs: Union[
2741
+ List[TextInput],
2742
+ List[TextInputPair],
2743
+ List[PreTokenizedInput],
2744
+ List[PreTokenizedInputPair],
2745
+ List[EncodedInput],
2746
+ List[EncodedInputPair],
2747
+ ],
2748
+ add_special_tokens: bool = True,
2749
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2750
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2751
+ max_length: Optional[int] = None,
2752
+ stride: int = 0,
2753
+ is_split_into_words: bool = False,
2754
+ pad_to_multiple_of: Optional[int] = None,
2755
+ padding_side: Optional[Literal["right", "left"]] = None,
2756
+ return_position_ids: Optional[bool] = None,
2757
+ return_tensors: Optional[Union[str, TensorType]] = None,
2758
+ return_token_type_ids: Optional[bool] = None,
2759
+ return_attention_mask: Optional[bool] = None,
2760
+ return_overflowing_tokens: bool = False,
2761
+ return_special_tokens_mask: bool = False,
2762
+ return_dict: bool = True,
2763
+ return_offsets_mapping: bool = False,
2764
+ return_length: bool = False,
2765
+ verbose: bool = True,
2766
+ **kwargs,
2767
+ ) -> BatchEncoding:
2768
+ raise NotImplementedError
2769
+
2770
+ def pad(
2771
+ self,
2772
+ encoded_inputs: Union[
2773
+ BatchEncoding,
2774
+ List[BatchEncoding],
2775
+ Dict[str, EncodedInput],
2776
+ Dict[str, List[EncodedInput]],
2777
+ List[Dict[str, EncodedInput]],
2778
+ ],
2779
+ padding: Union[bool, str, PaddingStrategy] = True,
2780
+ max_length: Optional[int] = None,
2781
+ padding_side: Optional[Literal["right", "left"]] = None,
2782
+ pad_to_multiple_of: Optional[int] = None,
2783
+ return_attention_mask: Optional[bool] = None,
2784
+ return_tensors: Optional[Union[str, TensorType]] = None,
2785
+ verbose: bool = True,
2786
+ ) -> BatchEncoding:
2787
+ """
2788
+ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
2789
+ in the batch.
2790
+
2791
+ Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
2792
+ `self.pad_token_id` and `self.pad_token_type_id`).
2793
+
2794
+ <Tip>
2795
+
2796
+ If the `encoded_inputs` passed are a dictionary of numpy arrays or Paddle tensors, the
2797
+ result will use the same type unless you provide a different tensor type with `return_tensors`.
2798
+ </Tip>
2799
+
2800
+ Args:
2801
+ encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
2802
+ Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
2803
+ tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
2804
+ List[int]]]*) so you can use this method during preprocessing as well as in a Paddle Dataloader
2805
+ collate function.
2806
+
2807
+ Instead of `List[int]` you can have tensors (numpy arrays, Paddle tensors), see
2808
+ the note above for the return type.
2809
+ padding (`bool`, `str` or [`PaddingStrategy`], *optional*, defaults to `True`):
2810
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
2811
+ index) among:
2812
+
2813
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
2814
+ sequence is provided).
2815
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
2816
+ acceptable input length for the model if that argument is not provided.
2817
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
2818
+ lengths).
2819
+ max_length (`int`, *optional*):
2820
+ Maximum length of the returned list and optionally padding length (see above).
2821
+ pad_to_multiple_of (`int`, *optional*):
2822
+ If set will pad the sequence to a multiple of the provided value.
2823
+
2824
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2825
+ >= 7.5 (Volta).
2826
+ padding_side (`str`, *optional*):
2827
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
2828
+ Default value is picked from the class attribute of the same name.
2829
+ return_attention_mask (`bool`, *optional*):
2830
+ Whether to return the attention mask. If left to the default, will return the attention mask according
2831
+ to the specific tokenizer's default, defined by the `return_outputs` attribute.
2832
+
2833
+ [What are attention masks?](../glossary#attention-mask)
2834
+ return_tensors (`str` or [`TensorType`], *optional*):
2835
+ If set, will return tensors instead of list of python integers. Acceptable values are:
2836
+
2837
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
2838
+ - `'np'`: Return Numpy `np.ndarray` objects.
2839
+ verbose (`bool`, *optional*, defaults to `True`):
2840
+ Whether or not to print more information and warnings.
2841
+ """
2842
+ import paddle
2843
+
2844
+ # If we have a list of dicts, let's convert it in a dict of lists
2845
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(
2846
+ encoded_inputs[0], (dict, BatchEncoding)
2847
+ ):
2848
+ encoded_inputs = {
2849
+ key: [example[key] for example in encoded_inputs]
2850
+ for key in encoded_inputs[0].keys()
2851
+ }
2852
+
2853
+ # The model's main input name, usually `input_ids`, has to be passed for padding
2854
+ if self.model_input_names[0] not in encoded_inputs:
2855
+ raise ValueError(
2856
+ "You should supply an encoding or a list of encodings to this method "
2857
+ f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
2858
+ )
2859
+
2860
+ required_input = encoded_inputs[self.model_input_names[0]]
2861
+
2862
+ if not required_input:
2863
+ if return_attention_mask:
2864
+ encoded_inputs["attention_mask"] = []
2865
+ return encoded_inputs
2866
+
2867
+ # If we have Paddle/NumPy tensors/arrays as inputs, we cast them as python objects
2868
+ # and rebuild them afterwards if no return_tensors is specified
2869
+
2870
+ first_element = required_input[0]
2871
+ if isinstance(first_element, (list, tuple)):
2872
+ # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
2873
+ for item in required_input:
2874
+ if len(item) != 0:
2875
+ first_element = item[0]
2876
+ break
2877
+ # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
2878
+ if not isinstance(first_element, (int, list, tuple)):
2879
+ if isinstance(first_element, paddle.Tensor):
2880
+ return_tensors = "pd" if return_tensors is None else return_tensors
2881
+ else:
2882
+ raise ValueError(
2883
+ f"type of {first_element} unknown: {type(first_element)}. "
2884
+ f"Should be either python or paddle object."
2885
+ )
2886
+
2887
+ for key, value in encoded_inputs.items():
2888
+ encoded_inputs[key] = to_py_obj(value)
2889
+
2890
+ # Convert padding_strategy in PaddingStrategy
2891
+ padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
2892
+ padding=padding, max_length=max_length, verbose=verbose
2893
+ )
2894
+
2895
+ required_input = encoded_inputs[self.model_input_names[0]]
2896
+ if required_input and not isinstance(required_input[0], (list, tuple)):
2897
+ # some tokenizers might not have the padding_side attribute
2898
+ if "padding_side" in set(inspect.signature(self._pad).parameters.keys()):
2899
+ encoded_inputs = self._pad(
2900
+ encoded_inputs,
2901
+ max_length=max_length,
2902
+ padding_strategy=padding_strategy,
2903
+ pad_to_multiple_of=pad_to_multiple_of,
2904
+ padding_side=padding_side,
2905
+ return_attention_mask=return_attention_mask,
2906
+ )
2907
+ else:
2908
+ original_padding_side = self.padding_side
2909
+ self.padding_side = padding_side
2910
+ encoded_inputs = self._pad(
2911
+ encoded_inputs,
2912
+ max_length=max_length,
2913
+ padding_strategy=padding_strategy,
2914
+ pad_to_multiple_of=pad_to_multiple_of,
2915
+ return_attention_mask=return_attention_mask,
2916
+ )
2917
+ self.padding_side = original_padding_side
2918
+
2919
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
2920
+
2921
+ batch_size = len(required_input)
2922
+ assert all(
2923
+ len(v) == batch_size for v in encoded_inputs.values()
2924
+ ), "Some items in the output dictionary have a different batch size than others."
2925
+
2926
+ if padding_strategy == PaddingStrategy.LONGEST:
2927
+ max_length = max(len(inputs) for inputs in required_input)
2928
+ padding_strategy = PaddingStrategy.MAX_LENGTH
2929
+
2930
+ batch_outputs = {}
2931
+ for i in range(batch_size):
2932
+ inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
2933
+ outputs = self._pad(
2934
+ inputs,
2935
+ max_length=max_length,
2936
+ padding_strategy=padding_strategy,
2937
+ padding_side=padding_side,
2938
+ pad_to_multiple_of=pad_to_multiple_of,
2939
+ return_attention_mask=return_attention_mask,
2940
+ )
2941
+
2942
+ for key, value in outputs.items():
2943
+ if key not in batch_outputs:
2944
+ batch_outputs[key] = []
2945
+ batch_outputs[key].append(value)
2946
+
2947
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
2948
+
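A sketch of the collate-function use case mentioned in the docstring, assuming `tok` is a concrete subclass that implements `_pad` in the usual way:

.. code-block::

    # Encode per sample without padding, then pad dynamically at batch time,
    # e.g. inside a DataLoader collate_fn.
    samples = [tok(t) for t in ["short", "a slightly longer input"]]
    batch = tok.pad(samples, padding=True, return_tensors="pd")
    # batch["input_ids"] is a paddle.Tensor padded to the longest sample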
2949
+ def create_token_type_ids_from_sequences(
2950
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2951
+ ) -> List[int]:
2952
+ """
2953
+ Create the token type IDs corresponding to the sequences passed. [What are token type
2954
+ IDs?](../glossary#token-type-ids)
2955
+
2956
+ Should be overridden in a subclass if the model has a special way of building those.
2957
+
2958
+ Args:
2959
+ token_ids_0 (`List[int]`): The first tokenized sequence.
2960
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
2961
+
2962
+ Returns:
2963
+ `List[int]`: The token type ids.
2964
+ """
2965
+ if token_ids_1 is None:
2966
+ return len(token_ids_0) * [0]
2967
+ return [0] * len(token_ids_0) + [1] * len(token_ids_1)
2968
+
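The base-class behaviour above, illustrated (assuming `tok` is any tokenizer instance that does not override this method):

.. code-block::

    tok.create_token_type_ids_from_sequences([5, 6, 7])           # [0, 0, 0]
    tok.create_token_type_ids_from_sequences([5, 6], [8, 9, 10])  # [0, 0, 1, 1, 1]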
2969
+ def build_inputs_with_special_tokens(
2970
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2971
+ ) -> List[int]:
2972
+ """
2973
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
2974
+ adding special tokens.
2975
+
2976
+ This implementation does not add special tokens and this method should be overridden in a subclass.
2977
+
2978
+ Args:
2979
+ token_ids_0 (`List[int]`): The first tokenized sequence.
2980
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
2981
+
2982
+ Returns:
2983
+ `List[int]`: The model input with special tokens.
2984
+ """
2985
+ if token_ids_1 is None:
2986
+ return token_ids_0
2987
+ return token_ids_0 + token_ids_1
2988
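For the base implementation above, a minimal sketch (ids arbitrary, `tokenizer` a hypothetical subclass instance):

tokenizer.build_inputs_with_special_tokens([11, 12])            # -> [11, 12]
tokenizer.build_inputs_with_special_tokens([11, 12], [21, 22])  # -> [11, 12, 21, 22]
# a BERT-style subclass would typically override this to return something like
# [cls_id] + [11, 12] + [sep_id] + [21, 22] + [sep_id]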
+
2989
+ def build_offset_mapping_with_special_tokens(
2990
+ self, offset_mapping_0, offset_mapping_1=None
2991
+ ):
2992
+ """
2993
+ Build an offset mapping from a pair of offset mappings by concatenating them and adding the offsets of special tokens.
2994
+
2995
+ Should be overridden in a subclass if the model has a special way of building those.
2996
+
2997
+ Args:
2998
+ offset_mapping_0 (List[tuple]):
2999
+ List of char offsets to which the special tokens will be added.
3000
+ offset_mapping_1 (List[tuple], optional):
3001
+ Optional second list of char offsets for offset mapping pairs.
3002
+
3003
+ Returns:
3004
+ List[tuple]: List of char offsets with the appropriate offsets of special tokens.
3005
+ """
3006
+ if offset_mapping_1 is None:
3007
+ return offset_mapping_0
3008
+
3009
+ return offset_mapping_0 + offset_mapping_1
3010
+
3011
+ def prepare_for_model(
3012
+ self,
3013
+ ids,
3014
+ pair_ids=None,
3015
+ padding: Union[bool, str, PaddingStrategy] = False,
3016
+ truncation: Union[bool, str, TruncationStrategy] = False,
3017
+ max_length: Optional[int] = None,
3018
+ stride: int = 0,
3019
+ pad_to_multiple_of: Optional[int] = None,
3020
+ padding_side: Optional[Literal["right", "left"]] = None,
3021
+ return_tensors: Optional[Union[str, TensorType]] = None,
3022
+ return_position_ids=None,
3023
+ return_token_type_ids: Optional[bool] = None,
3024
+ return_attention_mask: Optional[bool] = None,
3025
+ return_length=False,
3026
+ return_overflowing_tokens=False,
3027
+ return_special_tokens_mask=False,
3028
+ return_offsets_mapping=False,
3029
+ add_special_tokens=True,
3030
+ verbose: bool = True,
3031
+ prepend_batch_axis: bool = False,
3032
+ **kwargs,
3033
+ ):
3034
+ """
3035
+ Prepares a sequence of input ids, or a pair of sequences, so that it can be
3036
+ used by the model. It supports a single sequence or a sequence pair as input;
3037
+ batch input is not allowed.
3038
+ """
3039
+ padding_strategy, truncation_strategy, max_length, kwargs = (
3040
+ self._get_padding_truncation_strategies(
3041
+ padding=padding,
3042
+ truncation=truncation,
3043
+ max_length=max_length,
3044
+ pad_to_multiple_of=pad_to_multiple_of,
3045
+ verbose=verbose,
3046
+ **kwargs,
3047
+ )
3048
+ )
3049
+
3050
+ pair = bool(pair_ids is not None)
3051
+ len_ids = len(ids)
3052
+ len_pair_ids = len(pair_ids) if pair else 0
3053
+
3054
+ if return_token_type_ids and not add_special_tokens:
3055
+ raise ValueError(
3056
+ "Asking to return token_type_ids while setting add_special_tokens to False "
3057
+ "results in an undefined behavior. Please set add_special_tokens to True or "
3058
+ "set return_token_type_ids to None."
3059
+ )
3060
+
3061
+ if (
3062
+ return_overflowing_tokens
3063
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
3064
+ and pair_ids is not None
3065
+ ):
3066
+ raise ValueError(
3067
+ "Not possible to return overflowing tokens for pair of sequences with the "
3068
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
3069
+ "for instance `only_second` or `only_first`."
3070
+ )
3071
+
3072
+ # Load from model defaults
3073
+ if return_token_type_ids is None:
3074
+ return_token_type_ids = "token_type_ids" in self.model_input_names
3075
+ if return_attention_mask is None:
3076
+ return_attention_mask = "attention_mask" in self.model_input_names
3077
+ if return_position_ids is None:
3078
+ return_position_ids = "position_ids" in self.model_input_names
3079
+ encoded_inputs = {}
3080
+ # Truncation: Handle max sequence length
3081
+ total_len = (
3082
+ len_ids
3083
+ + len_pair_ids
3084
+ + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
3085
+ )
3086
+
3087
+ overflowing_tokens = []
3088
+
3089
+ if (
3090
+ truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
3091
+ and max_length
3092
+ and total_len > max_length
3093
+ ):
3094
+ ids, pair_ids, overflowing_tokens = self.truncate_sequences(
3095
+ ids,
3096
+ pair_ids=pair_ids,
3097
+ num_tokens_to_remove=total_len - max_length,
3098
+ truncation_strategy=truncation_strategy,
3099
+ stride=stride,
3100
+ )
3101
+ if return_overflowing_tokens:
3102
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
3103
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length
3104
+
3105
+ # Add special tokens
3106
+ if add_special_tokens:
3107
+ sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
3108
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
3109
+ else:
3110
+ sequence = ids + pair_ids if pair else ids
3111
+ token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
3112
+
3113
+ # Build output dictionary
3114
+ encoded_inputs["input_ids"] = sequence
3115
+ if return_token_type_ids:
3116
+ encoded_inputs["token_type_ids"] = token_type_ids
3117
+ if return_special_tokens_mask:
3118
+ if add_special_tokens:
3119
+ encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(
3120
+ ids, pair_ids
3121
+ )
3122
+ else:
3123
+ encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
3124
+
3125
+ if return_offsets_mapping and "text" in kwargs and "text_pair" in kwargs:
3126
+ text = kwargs.pop("text")
3127
+ text_pair = kwargs.pop("text_pair")
3128
+
3129
+ token_offset_mapping = self.get_offset_mapping(text)
3130
+ token_pair_offset_mapping = (
3131
+ self.get_offset_mapping(text_pair) if text_pair is not None else None
3132
+ )
3133
+ if max_length and total_len > max_length:
3134
+ token_offset_mapping, token_pair_offset_mapping, _ = (
3135
+ self.truncate_sequences(
3136
+ token_offset_mapping,
3137
+ pair_ids=token_pair_offset_mapping,
3138
+ num_tokens_to_remove=total_len - max_length,
3139
+ truncation_strategy=truncation_strategy,
3140
+ stride=stride,
3141
+ )
3142
+ )
3143
+ if add_special_tokens:
3144
+ offset_mapping = self.build_offset_mapping_with_special_tokens(
3145
+ token_offset_mapping, token_pair_offset_mapping
3146
+ )
3147
+ else:
3148
+ offset_mapping = (
3149
+ token_offset_mapping + token_pair_offset_mapping
3150
+ if token_pair_offset_mapping
3151
+ else token_offset_mapping
3152
+ )
3153
+ encoded_inputs["offset_mapping"] = offset_mapping
3154
+
3155
+ # Check lengths
3156
+ self._eventual_warn_about_too_long_sequence(
3157
+ encoded_inputs["input_ids"], max_length, verbose
3158
+ )
3159
+
3160
+ if return_position_ids:
3161
+ encoded_inputs["position_ids"] = list(
3162
+ range(len(encoded_inputs["input_ids"]))
3163
+ )
3164
+
3165
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
3166
+ encoded_inputs = self.pad(
3167
+ encoded_inputs,
3168
+ max_length=max_length,
3169
+ padding=padding_strategy.value,
3170
+ pad_to_multiple_of=pad_to_multiple_of,
3171
+ padding_side=padding_side,
3172
+ return_attention_mask=return_attention_mask,
3173
+ )
3174
+
3175
+ if return_length:
3176
+ encoded_inputs["length"] = len(encoded_inputs["input_ids"])
3177
+ # for compatibility
3178
+ encoded_inputs["seq_len"] = encoded_inputs["length"]
3179
+
3180
+ batch_outputs = BatchEncoding(
3181
+ encoded_inputs,
3182
+ tensor_type=return_tensors,
3183
+ prepend_batch_axis=prepend_batch_axis,
3184
+ )
3185
+
3186
+ return batch_outputs
3187
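A minimal usage sketch of prepare_for_model, assuming a concrete subclass instance named `tokenizer` that exposes the usual `tokenize`/`convert_tokens_to_ids` helpers and whose `model_input_names` starts with "input_ids":

ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("an example sentence"))
encoded = tokenizer.prepare_for_model(
    ids,
    max_length=16,
    padding="max_length",   # pad up to max_length
    truncation=True,        # truncate if the sequence is longer
    return_attention_mask=True,
)
# encoded is a BatchEncoding whose "input_ids" and "attention_mask" both have length 16.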
+
3188
+ def truncate_sequences(
3189
+ self,
3190
+ ids: List[int],
3191
+ pair_ids: Optional[List[int]] = None,
3192
+ num_tokens_to_remove: int = 0,
3193
+ truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
3194
+ stride: int = 0,
3195
+ ) -> Tuple[List[int], List[int], List[int]]:
3196
+ """
3197
+ Truncates a sequence pair in-place following the strategy.
3198
+
3199
+ Args:
3200
+ ids (`List[int]`):
3201
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
3202
+ `convert_tokens_to_ids` methods.
3203
+ pair_ids (`List[int]`, *optional*):
3204
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
3205
+ and `convert_tokens_to_ids` methods.
3206
+ num_tokens_to_remove (`int`, *optional*, defaults to 0):
3207
+ Number of tokens to remove using the truncation strategy.
3208
+ truncation_strategy (`str` or [`TruncationStrategy`], *optional*, defaults to `'longest_first'`):
3209
+ The strategy to follow for truncation. Can be:
3210
+
3211
+ - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
3212
+ maximum acceptable input length for the model if that argument is not provided. This will truncate
3213
+ token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
3214
+ batch of pairs) is provided.
3215
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
3216
+ maximum acceptable input length for the model if that argument is not provided. This will only
3217
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
3218
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
3219
+ maximum acceptable input length for the model if that argument is not provided. This will only
3220
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
3221
+ - `'do_not_truncate'`: No truncation (i.e., can output batch with sequence lengths greater
3222
+ than the model maximum admissible input size).
3223
+ stride (`int`, *optional*, defaults to 0):
3224
+ If set to a positive number, the overflowing tokens returned will contain some tokens from the main
3225
+ sequence returned. The value of this argument defines the number of additional tokens.
3226
+
3227
+ Returns:
3228
+ `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
3229
+ overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
3230
+ of sequences (or a batch of pairs) is provided.
3231
+ """
3232
+ if num_tokens_to_remove <= 0:
3233
+ return ids, pair_ids, []
3234
+
3235
+ if not isinstance(truncation_strategy, TruncationStrategy):
3236
+ truncation_strategy = TruncationStrategy(truncation_strategy)
3237
+
3238
+ overflowing_tokens = []
3239
+ if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
3240
+ truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
3241
+ ):
3242
+ if len(ids) > num_tokens_to_remove:
3243
+ window_len = min(len(ids), stride + num_tokens_to_remove)
3244
+ if self.truncation_side == "left":
3245
+ overflowing_tokens = ids[:window_len]
3246
+ ids = ids[num_tokens_to_remove:]
3247
+ elif self.truncation_side == "right":
3248
+ overflowing_tokens = ids[-window_len:]
3249
+ ids = ids[:-num_tokens_to_remove]
3250
+ else:
3251
+ raise ValueError(
3252
+ f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'."
3253
+ )
3254
+
3255
+ else:
3256
+ error_msg = (
3257
+ f"We need to remove {num_tokens_to_remove} to truncate the input "
3258
+ f"but the first sequence has a length {len(ids)}. "
3259
+ )
3260
+ if truncation_strategy == TruncationStrategy.ONLY_FIRST:
3261
+ error_msg = (
3262
+ error_msg + "Please select another truncation strategy than "
3263
+ f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
3264
+ )
3265
+ logging.error(error_msg)
3266
+ elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
3267
+ warnings.warn(
3268
+ f"Be aware, overflowing tokens are not returned for the setting you have chosen,"
3269
+ f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
3270
+ f"truncation strategy. So the returned list will always be empty even if some "
3271
+ f"tokens have been removed."
3272
+ )
3273
+ for _ in range(num_tokens_to_remove):
3274
+ if pair_ids is None or len(ids) > len(pair_ids):
3275
+ if self.truncation_side == "right":
3276
+ ids = ids[:-1]
3277
+ elif self.truncation_side == "left":
3278
+ ids = ids[1:]
3279
+ else:
3280
+ raise ValueError(
3281
+ "invalid truncation strategy:" + str(self.truncation_side)
3282
+ )
3283
+ else:
3284
+ if self.truncation_side == "right":
3285
+ pair_ids = pair_ids[:-1]
3286
+ elif self.truncation_side == "left":
3287
+ pair_ids = pair_ids[1:]
3288
+ else:
3289
+ raise ValueError(
3290
+ "invalid truncation strategy:" + str(self.truncation_side)
3291
+ )
3292
+ elif (
3293
+ truncation_strategy == TruncationStrategy.ONLY_SECOND
3294
+ and pair_ids is not None
3295
+ ):
3296
+ if len(pair_ids) > num_tokens_to_remove:
3297
+ window_len = min(len(pair_ids), stride + num_tokens_to_remove)
3298
+ if self.truncation_side == "right":
3299
+ overflowing_tokens = pair_ids[-window_len:]
3300
+ pair_ids = pair_ids[:-num_tokens_to_remove]
3301
+ elif self.truncation_side == "left":
3302
+ overflowing_tokens = pair_ids[:window_len]
3303
+ pair_ids = pair_ids[num_tokens_to_remove:]
3304
+ else:
3305
+ raise ValueError(
3306
+ "invalid truncation strategy:" + str(self.truncation_side)
3307
+ )
3308
+ else:
3309
+ logging.error(
3310
+ f"We need to remove {num_tokens_to_remove} to truncate the input "
3311
+ f"but the second sequence has a length {len(pair_ids)}. "
3312
+ f"Please select another truncation strategy than {truncation_strategy}, "
3313
+ f"for instance 'longest_first' or 'only_first'."
3314
+ )
3315
+
3316
+ return (ids, pair_ids, overflowing_tokens)
3317
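A minimal sketch of the `only_second` branch above, with arbitrary ids and a hypothetical subclass instance named `tokenizer` whose `truncation_side` is "right":

ids, pair_ids, overflow = tokenizer.truncate_sequences(
    [1, 2, 3, 4],
    pair_ids=[5, 6, 7, 8],
    num_tokens_to_remove=2,
    truncation_strategy="only_second",
    stride=1,
)
# ids       -> [1, 2, 3, 4]   (untouched)
# pair_ids  -> [5, 6]         (two tokens removed from the end)
# overflow  -> [6, 7, 8]      (stride + num_tokens_to_remove trailing tokens)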
+
3318
+ def _pad(
3319
+ self,
3320
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
3321
+ max_length: Optional[int] = None,
3322
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
3323
+ pad_to_multiple_of: Optional[int] = None,
3324
+ padding_side: Optional[Literal["right", "left"]] = None,
3325
+ return_attention_mask: Optional[bool] = None,
3326
+ ) -> dict:
3327
+ """
3328
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
3329
+
3330
+ Args:
3331
+ encoded_inputs:
3332
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
3333
+ max_length: maximum length of the returned list and optionally padding length (see below).
3334
+ Will truncate by taking into account the special tokens.
3335
+ padding_strategy: PaddingStrategy to use for padding.
3336
+
3337
+ - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
3338
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length
3339
+ - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
3340
+ The tokenizer padding sides are defined in `padding_side` argument:
3341
+
3342
+ - 'left': pads on the left of the sequences
3343
+ - 'right': pads on the right of the sequences
3344
+ pad_to_multiple_of: (optional) If set to an integer, pad the sequence to a multiple of the provided value.
3345
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
3346
+ >= 7.5 (Volta).
3347
+ padding_side: (optional) The side on which the model should have padding applied.
3348
+ Should be selected between ['right', 'left'].
3349
+ Default value is picked from the class attribute of the same name.
3350
+ return_attention_mask:
3351
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
3352
+ """
3353
+ # Load from model defaults
3354
+ if return_attention_mask is None:
3355
+ return_attention_mask = (
3356
+ "attention_mask" in self.model_input_names
3357
+ or "attention_mask" in encoded_inputs
3358
+ )
3359
+
3360
+ required_input = encoded_inputs[self.model_input_names[0]]
3361
+
3362
+ if padding_strategy == PaddingStrategy.LONGEST:
3363
+ max_length = len(required_input)
3364
+
3365
+ if (
3366
+ max_length is not None
3367
+ and pad_to_multiple_of is not None
3368
+ and (max_length % pad_to_multiple_of != 0)
3369
+ ):
3370
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
3371
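# Illustrative arithmetic for the rounding above (editorial example, values assumed):
# with max_length=60 and pad_to_multiple_of=8, ((60 // 8) + 1) * 8 == 64, so the target
# length is rounded up to the next multiple of 8; exact multiples are left unchanged
# by the preceding modulo check.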
+
3372
+ needs_to_be_padded = (
3373
+ padding_strategy != PaddingStrategy.DO_NOT_PAD
3374
+ and len(required_input) != max_length
3375
+ )
3376
+
3377
+ # Initialize attention mask if not present.
3378
+ if return_attention_mask and "attention_mask" not in encoded_inputs:
3379
+ encoded_inputs["attention_mask"] = [1] * len(required_input)
3380
+
3381
+ if needs_to_be_padded:
3382
+ difference = max_length - len(required_input)
3383
+ padding_side = (
3384
+ padding_side if padding_side is not None else self.padding_side
3385
+ )
3386
+
3387
+ if padding_side == "right":
3388
+ if return_attention_mask:
3389
+ if len(np.shape(encoded_inputs["attention_mask"])) > 2:
3390
+ encoded_inputs["attention_mask"] = np.pad(
3391
+ encoded_inputs["attention_mask"],
3392
+ pad_width=[(0, 0), (0, difference), (0, difference)],
3393
+ mode="constant",
3394
+ constant_values=0,
3395
+ ).tolist()
3396
+ else:
3397
+ encoded_inputs["attention_mask"] = (
3398
+ encoded_inputs["attention_mask"] + [0] * difference
3399
+ )
3400
+ if "attn_mask_startend_row_indices" in encoded_inputs:
3401
+ encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
3402
+ [
3403
+ np.array(
3404
+ [encoded_inputs["attn_mask_startend_row_indices"]],
3405
+ dtype=np.int32,
3406
+ ),
3407
+ np.zeros([1, difference], dtype=np.int32),
3408
+ ],
3409
+ axis=-1,
3410
+ )
3411
+ if "token_type_ids" in encoded_inputs:
3412
+ encoded_inputs["token_type_ids"] = (
3413
+ encoded_inputs["token_type_ids"]
3414
+ + [self.pad_token_type_id] * difference
3415
+ )
3416
+ if "special_tokens_mask" in encoded_inputs:
3417
+ encoded_inputs["special_tokens_mask"] = (
3418
+ encoded_inputs["special_tokens_mask"] + [1] * difference
3419
+ )
3420
+ if "offset_mapping" in encoded_inputs:
3421
+ encoded_inputs["offset_mapping"] = (
3422
+ encoded_inputs["offset_mapping"] + [(0, 0)] * difference
3423
+ )
3424
+ if "position_ids" in encoded_inputs:
3425
+ encoded_inputs["position_ids"] = (
3426
+ encoded_inputs["position_ids"] + [0] * difference
3427
+ )
3428
+ # NOTE: In ernie3.0-qa, the type of `*_positions` is int.
3429
+ if "start_positions" in encoded_inputs and isinstance(
3430
+ encoded_inputs["start_positions"], list
3431
+ ):
3432
+ encoded_inputs["start_positions"] = (
3433
+ encoded_inputs["start_positions"] + [0] * difference
3434
+ )
3435
+ if "end_positions" in encoded_inputs and isinstance(
3436
+ encoded_inputs["end_positions"], list
3437
+ ):
3438
+ encoded_inputs["end_positions"] = (
3439
+ encoded_inputs["end_positions"] + [0] * difference
3440
+ )
3441
+ encoded_inputs[self.model_input_names[0]] = (
3442
+ required_input + [self.pad_token_id] * difference
3443
+ )
3444
+ elif padding_side == "left":
3445
+ if return_attention_mask:
3446
+ if len(np.shape(encoded_inputs["attention_mask"])) > 2:
3447
+ # attention_mask shape [1,seq_len,seq_len]
3448
+ encoded_inputs["attention_mask"] = np.pad(
3449
+ encoded_inputs["attention_mask"],
3450
+ pad_width=[(0, 0), (difference, 0), (difference, 0)],
3451
+ mode="constant",
3452
+ constant_values=0,
3453
+ ).tolist()
3454
+ else:
3455
+ encoded_inputs["attention_mask"] = [
3456
+ 0
3457
+ ] * difference + encoded_inputs["attention_mask"]
3458
+ if "attn_mask_startend_row_indices" in encoded_inputs:
3459
+ encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
3460
+ [
3461
+ np.zeros([1, difference], dtype=np.int32),
3462
+ np.array(
3463
+ [encoded_inputs["attn_mask_startend_row_indices"]],
3464
+ dtype=np.int32,
3465
+ )
3466
+ + difference,
3467
+ ],
3468
+ axis=-1,
3469
+ )
3470
+ if "token_type_ids" in encoded_inputs:
3471
+ encoded_inputs["token_type_ids"] = [
3472
+ self.pad_token_type_id
3473
+ ] * difference + encoded_inputs["token_type_ids"]
3474
+ if "special_tokens_mask" in encoded_inputs:
3475
+ encoded_inputs["special_tokens_mask"] = [
3476
+ 1
3477
+ ] * difference + encoded_inputs["special_tokens_mask"]
3478
+ if "offset_mapping" in encoded_inputs:
3479
+ encoded_inputs["offset_mapping"] = [
3480
+ (0, 0)
3481
+ ] * difference + encoded_inputs["offset_mapping"]
3482
+ if "position_ids" in encoded_inputs:
3483
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs[
3484
+ "position_ids"
3485
+ ]
3486
+ if "start_positions" in encoded_inputs and isinstance(
3487
+ encoded_inputs["start_positions"], list
3488
+ ):
3489
+ encoded_inputs["start_positions"] = [
3490
+ 0
3491
+ ] * difference + encoded_inputs["start_positions"]
3492
+ if "end_positions" in encoded_inputs and isinstance(
3493
+ encoded_inputs["end_positions"], list
3494
+ ):
3495
+ encoded_inputs["end_positions"] = [0] * difference + encoded_inputs[
3496
+ "end_positions"
3497
+ ]
3498
+ encoded_inputs[self.model_input_names[0]] = [
3499
+ self.pad_token_id
3500
+ ] * difference + required_input
3501
+ else:
3502
+ raise ValueError("Invalid padding strategy:" + str(self.padding_side))
3503
+ else:
3504
+ if "attn_mask_startend_row_indices" in encoded_inputs:
3505
+ if len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 1:
3506
+ encoded_inputs["attn_mask_startend_row_indices"] = np.array([encoded_inputs["attn_mask_startend_row_indices"]], dtype=np.int32) # fmt:skip
3507
+
3508
+ if "attn_mask_startend_row_indices" in encoded_inputs:
3509
+ assert (
3510
+ len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 2
3511
+ ) # [num_head, seq_len]
3512
+
3513
+ return encoded_inputs
3514
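A minimal sketch of _pad on a single example, assuming a hypothetical subclass instance named `tokenizer` with pad_token_id == 0 and "input_ids" as its first model input name:

features = {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1]}
padded = tokenizer._pad(
    features,
    max_length=6,
    padding_strategy=PaddingStrategy.MAX_LENGTH,
    padding_side="left",
    return_attention_mask=True,
)
# padded["input_ids"]      -> [0, 0, 0, 5, 6, 7]
# padded["attention_mask"] -> [0, 0, 0, 1, 1, 1]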
+
3515
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
3516
+ """
3517
+ Converts a sequence of tokens into a single string. The simplest way to do it is `" ".join(tokens)` but we
3518
+ often want to remove sub-word tokenization artifacts at the same time.
3519
+
3520
+ Args:
3521
+ tokens (`List[str]`): The tokens to join into a string.
3522
+
3523
+ Returns:
3524
+ `str`: The joined tokens.
3525
+ """
3526
+ raise NotImplementedError
3527
+
3528
+ def decode_token(
3529
+ self,
3530
+ all_input_ids: List[int],
3531
+ prefix_offset: int = 0,
3532
+ read_offset: int = 0,
3533
+ ) -> Tuple[str, int, int]:
3534
+ """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
3535
+ prefix_text = self.decode(
3536
+ all_input_ids[prefix_offset:read_offset],
3537
+ skip_special_tokens=False,
3538
+ clean_up_tokenization_spaces=False,
3539
+ )
3540
+ new_text = self.decode(
3541
+ all_input_ids[prefix_offset:],
3542
+ skip_special_tokens=False,
3543
+ clean_up_tokenization_spaces=False,
3544
+ )
3545
+
3546
+ if (
3547
+ len(new_text) > len(prefix_text)
3548
+ and not prefix_text.endswith("�")
3549
+ and not new_text.endswith("�")
3550
+ ):
3551
+ prefix_index = new_text.index(prefix_text)
3552
+ new_text = new_text[prefix_index + len(prefix_text) :]
3553
+ return new_text, read_offset, len(all_input_ids)
3554
+ else:
3555
+ return "", prefix_offset, read_offset
3556
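decode_token is designed to be called repeatedly as ids are generated; a rough streaming loop sketch (the id stream and the `tokenizer` instance are hypothetical):

all_ids, prefix_offset, read_offset = [], 0, 0
for token_id in generated_token_ids:  # hypothetical stream of generated ids
    all_ids.append(token_id)
    text, prefix_offset, read_offset = tokenizer.decode_token(
        all_ids, prefix_offset=prefix_offset, read_offset=read_offset
    )
    if text:  # an empty string means "wait for more ids before emitting text"
        print(text, end="", flush=True)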
+
3557
+ def batch_decode(
3558
+ self,
3559
+ sequences,
3560
+ skip_special_tokens: bool = False,
3561
+ clean_up_tokenization_spaces: bool = True,
3562
+ **kwargs,
3563
+ ) -> List[str]:
3564
+ """
3565
+ Convert a list of lists of token ids into a list of strings by calling decode.
3566
+
3567
+ Args:
3568
+ sequences (`Union[List[int], List[List[int]], np.ndarray, paddle.Tensor]`):
3569
+ List of tokenized input ids. Can be obtained using the `__call__` method.
3570
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
3571
+ Whether or not to remove special tokens in the decoding.
3572
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
3573
+ Whether or not to clean up the tokenization spaces.
3574
+ kwargs (additional keyword arguments, *optional*):
3575
+ Will be passed to the underlying model specific decode method.
3576
+
3577
+ Returns:
3578
+ `List[str]`: The list of decoded sentences.
3579
+ """
3580
+ return [
3581
+ self.decode(
3582
+ seq,
3583
+ skip_special_tokens=skip_special_tokens,
3584
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3585
+ **kwargs,
3586
+ )
3587
+ for seq in sequences
3588
+ ]
3589
+
3590
+ def decode(
3591
+ self,
3592
+ token_ids,
3593
+ skip_special_tokens: bool = False,
3594
+ clean_up_tokenization_spaces: bool = True,
3595
+ **kwargs,
3596
+ ) -> str:
3597
+ """
3598
+ Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
3599
+ tokens and clean up tokenization spaces.
3600
+
3601
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
3602
+
3603
+ Args:
3604
+ token_ids (`Union[int, List[int], np.ndarray, paddle.Tensor]`):
3605
+ List of tokenized input ids. Can be obtained using the `__call__` method.
3606
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
3607
+ Whether or not to remove special tokens in the decoding.
3608
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
3609
+ Whether or not to clean up the tokenization spaces.
3610
+ kwargs (additional keyword arguments, *optional*):
3611
+ Will be passed to the underlying model specific decode method.
3612
+
3613
+ Returns:
3614
+ `str`: The decoded sentence.
3615
+ """
3616
+ # Convert inputs to python lists
3617
+ token_ids = to_py_obj(token_ids)
3618
+
3619
+ return self._decode(
3620
+ token_ids=token_ids,
3621
+ skip_special_tokens=skip_special_tokens,
3622
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3623
+ **kwargs,
3624
+ )
3625
+
3626
+ def _decode(
3627
+ self,
3628
+ token_ids: Union[int, List[int]],
3629
+ skip_special_tokens: bool = False,
3630
+ clean_up_tokenization_spaces: bool = True,
3631
+ **kwargs,
3632
+ ) -> str:
3633
+ raise NotImplementedError
3634
+
3635
+ def get_special_tokens_mask(
3636
+ self,
3637
+ token_ids_0: List[int],
3638
+ token_ids_1: Optional[List[int]] = None,
3639
+ already_has_special_tokens: bool = False,
3640
+ ) -> List[int]:
3641
+ """
3642
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
3643
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
3644
+
3645
+ Args:
3646
+ token_ids_0 (`List[int]`):
3647
+ List of ids of the first sequence.
3648
+ token_ids_1 (`List[int]`, *optional*):
3649
+ List of ids of the second sequence.
3650
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
3651
+ Whether or not the token list is already formatted with special tokens for the model.
3652
+
3653
+ Returns:
3654
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
3655
+ """
3656
+ assert already_has_special_tokens and token_ids_1 is None, (
3657
+ "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
3658
+ "Please use a slow (full python) tokenizer to activate this argument. "
3659
+ "Or set `return_special_tokens_mask=True` when calling the encoding method "
3660
+ "to get the special tokens mask in any tokenizer. "
3661
+ )
3662
+
3663
+ all_special_ids = self.all_special_ids # cache the property
3664
+
3665
+ special_tokens_mask = [
3666
+ 1 if token in all_special_ids else 0 for token in token_ids_0
3667
+ ]
3668
+
3669
+ return special_tokens_mask
3670
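A minimal sketch of the mask produced above, assuming a hypothetical `tokenizer` whose `all_special_ids` contains 0 and 2:

tokenizer.get_special_tokens_mask([2, 15, 27, 2, 0], already_has_special_tokens=True)
# -> [1, 0, 0, 1, 1]   (1 marks special tokens, 0 marks ordinary tokens)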
+
3671
+ @staticmethod
3672
+ def clean_up_tokenization(out_string: str) -> str:
3673
+ """
3674
+ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
3675
+
3676
+ Args:
3677
+ out_string (`str`): The text to clean up.
3678
+
3679
+ Returns:
3680
+ `str`: The cleaned-up string.
3681
+ """
3682
+ out_string = (
3683
+ out_string.replace(" .", ".")
3684
+ .replace(" ?", "?")
3685
+ .replace(" !", "!")
3686
+ .replace(" ,", ",")
3687
+ .replace(" ' ", "'")
3688
+ .replace(" n't", "n't")
3689
+ .replace(" 'm", "'m")
3690
+ .replace(" 's", "'s")
3691
+ .replace(" 've", "'ve")
3692
+ .replace(" 're", "'re")
3693
+ )
3694
+ return out_string
3695
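For the replacements above, a minimal example of the clean-up (input string chosen purely for illustration):

tokenizer.clean_up_tokenization("do n't worry , it 's fine .")
# -> "don't worry, it's fine."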
+
3696
+ def _eventual_warn_about_too_long_sequence(
3697
+ self, ids: List[int], max_length: Optional[int], verbose: bool
3698
+ ):
3699
+ """
3700
+ Depending on the input and internal state, we might trigger a warning about a sequence that is too long for its
3701
+ corresponding model.
3702
+
3703
+ Args:
3704
+ ids (`List[int]`): The ids produced by the tokenization
3705
+ max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
3706
+ verbose (`bool`): Whether or not to print more information and warnings.
3707
+
3708
+ """
3709
+ if max_length is None and len(ids) > self.model_max_length and verbose:
3710
+ if not self.deprecation_warnings.get(
3711
+ "sequence-length-is-longer-than-the-specified-maximum", False
3712
+ ):
3713
+ logging.warning(
3714
+ "Token indices sequence length is longer than the specified maximum sequence length "
3715
+ f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
3716
+ "will result in indexing errors"
3717
+ )
3718
+ self.deprecation_warnings[
3719
+ "sequence-length-is-longer-than-the-specified-maximum"
3720
+ ] = True