paddlex 2.1.0-py3-none-any.whl → 3.0.0rc0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1708)
  1. paddlex/.version +1 -0
  2. paddlex/__init__.py +52 -19
  3. paddlex/__main__.py +39 -0
  4. paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml +38 -0
  5. paddlex/configs/modules/doc_text_orientation/PP-LCNet_x1_0_doc_ori.yaml +41 -0
  6. paddlex/configs/modules/face_detection/BlazeFace-FPN-SSH.yaml +40 -0
  7. paddlex/configs/modules/face_detection/BlazeFace.yaml +40 -0
  8. paddlex/configs/modules/face_detection/PP-YOLOE_plus-S_face.yaml +40 -0
  9. paddlex/configs/modules/face_detection/PicoDet_LCNet_x2_5_face.yaml +40 -0
  10. paddlex/configs/modules/face_feature/MobileFaceNet.yaml +41 -0
  11. paddlex/configs/modules/face_feature/ResNet50_face.yaml +41 -0
  12. paddlex/configs/modules/formula_recognition/LaTeX_OCR_rec.yaml +40 -0
  13. paddlex/configs/modules/formula_recognition/PP-FormulaNet-L.yaml +40 -0
  14. paddlex/configs/modules/formula_recognition/PP-FormulaNet-S.yaml +40 -0
  15. paddlex/configs/modules/formula_recognition/UniMERNet.yaml +40 -0
  16. paddlex/configs/modules/human_detection/PP-YOLOE-L_human.yaml +42 -0
  17. paddlex/configs/modules/human_detection/PP-YOLOE-S_human.yaml +42 -0
  18. paddlex/configs/modules/image_anomaly_detection/STFPM.yaml +41 -0
  19. paddlex/configs/modules/image_classification/CLIP_vit_base_patch16_224.yaml +41 -0
  20. paddlex/configs/modules/image_classification/CLIP_vit_large_patch14_224.yaml +41 -0
  21. paddlex/configs/modules/image_classification/ConvNeXt_base_224.yaml +41 -0
  22. paddlex/configs/modules/image_classification/ConvNeXt_base_384.yaml +41 -0
  23. paddlex/configs/modules/image_classification/ConvNeXt_large_224.yaml +41 -0
  24. paddlex/configs/modules/image_classification/ConvNeXt_large_384.yaml +41 -0
  25. paddlex/configs/modules/image_classification/ConvNeXt_small.yaml +41 -0
  26. paddlex/configs/modules/image_classification/ConvNeXt_tiny.yaml +41 -0
  27. paddlex/configs/modules/image_classification/FasterNet-L.yaml +40 -0
  28. paddlex/configs/modules/image_classification/FasterNet-M.yaml +40 -0
  29. paddlex/configs/modules/image_classification/FasterNet-S.yaml +40 -0
  30. paddlex/configs/modules/image_classification/FasterNet-T0.yaml +40 -0
  31. paddlex/configs/modules/image_classification/FasterNet-T1.yaml +40 -0
  32. paddlex/configs/modules/image_classification/FasterNet-T2.yaml +40 -0
  33. paddlex/configs/modules/image_classification/MobileNetV1_x0_25.yaml +41 -0
  34. paddlex/configs/modules/image_classification/MobileNetV1_x0_5.yaml +41 -0
  35. paddlex/configs/modules/image_classification/MobileNetV1_x0_75.yaml +41 -0
  36. paddlex/configs/modules/image_classification/MobileNetV1_x1_0.yaml +41 -0
  37. paddlex/configs/modules/image_classification/MobileNetV2_x0_25.yaml +41 -0
  38. paddlex/configs/modules/image_classification/MobileNetV2_x0_5.yaml +41 -0
  39. paddlex/configs/modules/image_classification/MobileNetV2_x1_0.yaml +41 -0
  40. paddlex/configs/modules/image_classification/MobileNetV2_x1_5.yaml +41 -0
  41. paddlex/configs/modules/image_classification/MobileNetV2_x2_0.yaml +41 -0
  42. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_35.yaml +41 -0
  43. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_5.yaml +41 -0
  44. paddlex/configs/modules/image_classification/MobileNetV3_large_x0_75.yaml +41 -0
  45. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_0.yaml +41 -0
  46. paddlex/configs/modules/image_classification/MobileNetV3_large_x1_25.yaml +41 -0
  47. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_35.yaml +41 -0
  48. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_5.yaml +41 -0
  49. paddlex/configs/modules/image_classification/MobileNetV3_small_x0_75.yaml +41 -0
  50. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_0.yaml +41 -0
  51. paddlex/configs/modules/image_classification/MobileNetV3_small_x1_25.yaml +41 -0
  52. paddlex/configs/modules/image_classification/MobileNetV4_conv_large.yaml +41 -0
  53. paddlex/configs/modules/image_classification/MobileNetV4_conv_medium.yaml +41 -0
  54. paddlex/configs/modules/image_classification/MobileNetV4_conv_small.yaml +41 -0
  55. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_large.yaml +41 -0
  56. paddlex/configs/modules/image_classification/MobileNetV4_hybrid_medium.yaml +41 -0
  57. paddlex/configs/modules/image_classification/PP-HGNetV2-B0.yaml +41 -0
  58. paddlex/configs/modules/image_classification/PP-HGNetV2-B1.yaml +41 -0
  59. paddlex/configs/modules/image_classification/PP-HGNetV2-B2.yaml +41 -0
  60. paddlex/configs/modules/image_classification/PP-HGNetV2-B3.yaml +41 -0
  61. paddlex/configs/modules/image_classification/PP-HGNetV2-B4.yaml +41 -0
  62. paddlex/configs/modules/image_classification/PP-HGNetV2-B5.yaml +41 -0
  63. paddlex/configs/modules/image_classification/PP-HGNetV2-B6.yaml +41 -0
  64. paddlex/configs/modules/image_classification/PP-HGNet_base.yaml +41 -0
  65. paddlex/configs/modules/image_classification/PP-HGNet_small.yaml +41 -0
  66. paddlex/configs/modules/image_classification/PP-HGNet_tiny.yaml +41 -0
  67. paddlex/configs/modules/image_classification/PP-LCNetV2_base.yaml +41 -0
  68. paddlex/configs/modules/image_classification/PP-LCNetV2_large.yaml +41 -0
  69. paddlex/configs/modules/image_classification/PP-LCNetV2_small.yaml +41 -0
  70. paddlex/configs/modules/image_classification/PP-LCNet_x0_25.yaml +41 -0
  71. paddlex/configs/modules/image_classification/PP-LCNet_x0_35.yaml +41 -0
  72. paddlex/configs/modules/image_classification/PP-LCNet_x0_5.yaml +41 -0
  73. paddlex/configs/modules/image_classification/PP-LCNet_x0_75.yaml +41 -0
  74. paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yaml +41 -0
  75. paddlex/configs/modules/image_classification/PP-LCNet_x1_5.yaml +41 -0
  76. paddlex/configs/modules/image_classification/PP-LCNet_x2_0.yaml +41 -0
  77. paddlex/configs/modules/image_classification/PP-LCNet_x2_5.yaml +41 -0
  78. paddlex/configs/modules/image_classification/ResNet101.yaml +41 -0
  79. paddlex/configs/modules/image_classification/ResNet101_vd.yaml +41 -0
  80. paddlex/configs/modules/image_classification/ResNet152.yaml +41 -0
  81. paddlex/configs/modules/image_classification/ResNet152_vd.yaml +41 -0
  82. paddlex/configs/modules/image_classification/ResNet18.yaml +41 -0
  83. paddlex/configs/modules/image_classification/ResNet18_vd.yaml +41 -0
  84. paddlex/configs/modules/image_classification/ResNet200_vd.yaml +41 -0
  85. paddlex/configs/modules/image_classification/ResNet34.yaml +41 -0
  86. paddlex/configs/modules/image_classification/ResNet34_vd.yaml +41 -0
  87. paddlex/configs/modules/image_classification/ResNet50.yaml +41 -0
  88. paddlex/configs/modules/image_classification/ResNet50_vd.yaml +41 -0
  89. paddlex/configs/modules/image_classification/StarNet-S1.yaml +41 -0
  90. paddlex/configs/modules/image_classification/StarNet-S2.yaml +41 -0
  91. paddlex/configs/modules/image_classification/StarNet-S3.yaml +41 -0
  92. paddlex/configs/modules/image_classification/StarNet-S4.yaml +41 -0
  93. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window12_384.yaml +41 -0
  94. paddlex/configs/modules/image_classification/SwinTransformer_base_patch4_window7_224.yaml +41 -0
  95. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window12_384.yaml +41 -0
  96. paddlex/configs/modules/image_classification/SwinTransformer_large_patch4_window7_224.yaml +41 -0
  97. paddlex/configs/modules/image_classification/SwinTransformer_small_patch4_window7_224.yaml +41 -0
  98. paddlex/configs/modules/image_classification/SwinTransformer_tiny_patch4_window7_224.yaml +41 -0
  99. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec.yaml +42 -0
  100. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_base.yaml +42 -0
  101. paddlex/configs/modules/image_feature/PP-ShiTuV2_rec_CLIP_vit_large.yaml +41 -0
  102. paddlex/configs/modules/image_multilabel_classification/CLIP_vit_base_patch16_448_ML.yaml +41 -0
  103. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B0_ML.yaml +41 -0
  104. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B4_ML.yaml +41 -0
  105. paddlex/configs/modules/image_multilabel_classification/PP-HGNetV2-B6_ML.yaml +41 -0
  106. paddlex/configs/modules/image_multilabel_classification/PP-LCNet_x1_0_ML.yaml +41 -0
  107. paddlex/configs/modules/image_multilabel_classification/ResNet50_ML.yaml +41 -0
  108. paddlex/configs/modules/image_unwarping/UVDoc.yaml +12 -0
  109. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-FPN.yaml +40 -0
  110. paddlex/configs/modules/instance_segmentation/Cascade-MaskRCNN-ResNet50-vd-SSLDv2-FPN.yaml +40 -0
  111. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-H.yaml +40 -0
  112. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-L.yaml +40 -0
  113. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-M.yaml +40 -0
  114. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-S.yaml +40 -0
  115. paddlex/configs/modules/instance_segmentation/Mask-RT-DETR-X.yaml +40 -0
  116. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNeXt101-vd-FPN.yaml +39 -0
  117. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-FPN.yaml +40 -0
  118. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet101-vd-FPN.yaml +40 -0
  119. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-FPN.yaml +40 -0
  120. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50-vd-FPN.yaml +40 -0
  121. paddlex/configs/modules/instance_segmentation/MaskRCNN-ResNet50.yaml +40 -0
  122. paddlex/configs/modules/instance_segmentation/PP-YOLOE_seg-S.yaml +40 -0
  123. paddlex/configs/modules/instance_segmentation/SOLOv2.yaml +40 -0
  124. paddlex/configs/modules/keypoint_detection/PP-TinyPose_128x96.yaml +40 -0
  125. paddlex/configs/modules/keypoint_detection/PP-TinyPose_256x192.yaml +40 -0
  126. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +40 -0
  127. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +40 -0
  128. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +40 -0
  129. paddlex/configs/modules/layout_detection/PicoDet-L_layout_17cls.yaml +40 -0
  130. paddlex/configs/modules/layout_detection/PicoDet-L_layout_3cls.yaml +40 -0
  131. paddlex/configs/modules/layout_detection/PicoDet-S_layout_17cls.yaml +40 -0
  132. paddlex/configs/modules/layout_detection/PicoDet-S_layout_3cls.yaml +40 -0
  133. paddlex/configs/modules/layout_detection/PicoDet_layout_1x.yaml +40 -0
  134. paddlex/configs/modules/layout_detection/PicoDet_layout_1x_table.yaml +40 -0
  135. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_17cls.yaml +40 -0
  136. paddlex/configs/modules/layout_detection/RT-DETR-H_layout_3cls.yaml +40 -0
  137. paddlex/configs/modules/mainbody_detection/PP-ShiTuV2_det.yaml +41 -0
  138. paddlex/configs/modules/multilingual_speech_recognition/whisper_base.yaml +12 -0
  139. paddlex/configs/modules/multilingual_speech_recognition/whisper_large.yaml +12 -0
  140. paddlex/configs/modules/multilingual_speech_recognition/whisper_medium.yaml +12 -0
  141. paddlex/configs/modules/multilingual_speech_recognition/whisper_small.yaml +12 -0
  142. paddlex/configs/modules/multilingual_speech_recognition/whisper_tiny.yaml +12 -0
  143. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-FPN.yaml +41 -0
  144. paddlex/configs/modules/object_detection/Cascade-FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  145. paddlex/configs/modules/object_detection/CenterNet-DLA-34.yaml +41 -0
  146. paddlex/configs/modules/object_detection/CenterNet-ResNet50.yaml +41 -0
  147. paddlex/configs/modules/object_detection/Co-DINO-R50.yaml +40 -0
  148. paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml +40 -0
  149. paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml +40 -0
  150. paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml +40 -0
  151. paddlex/configs/modules/object_detection/DETR-R50.yaml +42 -0
  152. paddlex/configs/modules/object_detection/FCOS-ResNet50.yaml +41 -0
  153. paddlex/configs/modules/object_detection/FasterRCNN-ResNeXt101-vd-FPN.yaml +42 -0
  154. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101-FPN.yaml +42 -0
  155. paddlex/configs/modules/object_detection/FasterRCNN-ResNet101.yaml +42 -0
  156. paddlex/configs/modules/object_detection/FasterRCNN-ResNet34-FPN.yaml +42 -0
  157. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-FPN.yaml +42 -0
  158. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-FPN.yaml +42 -0
  159. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50-vd-SSLDv2-FPN.yaml +42 -0
  160. paddlex/configs/modules/object_detection/FasterRCNN-ResNet50.yaml +42 -0
  161. paddlex/configs/modules/object_detection/FasterRCNN-Swin-Tiny-FPN.yaml +42 -0
  162. paddlex/configs/modules/object_detection/PP-YOLOE_plus-L.yaml +40 -0
  163. paddlex/configs/modules/object_detection/PP-YOLOE_plus-M.yaml +40 -0
  164. paddlex/configs/modules/object_detection/PP-YOLOE_plus-S.yaml +40 -0
  165. paddlex/configs/modules/object_detection/PP-YOLOE_plus-X.yaml +40 -0
  166. paddlex/configs/modules/object_detection/PicoDet-L.yaml +40 -0
  167. paddlex/configs/modules/object_detection/PicoDet-M.yaml +42 -0
  168. paddlex/configs/modules/object_detection/PicoDet-S.yaml +40 -0
  169. paddlex/configs/modules/object_detection/PicoDet-XS.yaml +42 -0
  170. paddlex/configs/modules/object_detection/RT-DETR-H.yaml +40 -0
  171. paddlex/configs/modules/object_detection/RT-DETR-L.yaml +40 -0
  172. paddlex/configs/modules/object_detection/RT-DETR-R18.yaml +40 -0
  173. paddlex/configs/modules/object_detection/RT-DETR-R50.yaml +40 -0
  174. paddlex/configs/modules/object_detection/RT-DETR-X.yaml +40 -0
  175. paddlex/configs/modules/object_detection/YOLOX-L.yaml +40 -0
  176. paddlex/configs/modules/object_detection/YOLOX-M.yaml +40 -0
  177. paddlex/configs/modules/object_detection/YOLOX-N.yaml +40 -0
  178. paddlex/configs/modules/object_detection/YOLOX-S.yaml +40 -0
  179. paddlex/configs/modules/object_detection/YOLOX-T.yaml +40 -0
  180. paddlex/configs/modules/object_detection/YOLOX-X.yaml +40 -0
  181. paddlex/configs/modules/object_detection/YOLOv3-DarkNet53.yaml +40 -0
  182. paddlex/configs/modules/object_detection/YOLOv3-MobileNetV3.yaml +40 -0
  183. paddlex/configs/modules/object_detection/YOLOv3-ResNet50_vd_DCN.yaml +40 -0
  184. paddlex/configs/modules/open_vocabulary_detection/GroundingDINO-T.yaml +13 -0
  185. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_box.yaml +17 -0
  186. paddlex/configs/modules/open_vocabulary_segmentation/SAM-H_point.yaml +15 -0
  187. paddlex/configs/modules/pedestrian_attribute_recognition/PP-LCNet_x1_0_pedestrian_attribute.yaml +41 -0
  188. paddlex/configs/modules/rotated_object_detection/PP-YOLOE-R-L.yaml +40 -0
  189. paddlex/configs/modules/seal_text_detection/PP-OCRv4_mobile_seal_det.yaml +40 -0
  190. paddlex/configs/modules/seal_text_detection/PP-OCRv4_server_seal_det.yaml +40 -0
  191. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R101.yaml +40 -0
  192. paddlex/configs/modules/semantic_segmentation/Deeplabv3-R50.yaml +40 -0
  193. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R101.yaml +40 -0
  194. paddlex/configs/modules/semantic_segmentation/Deeplabv3_Plus-R50.yaml +40 -0
  195. paddlex/configs/modules/semantic_segmentation/MaskFormer_small.yaml +42 -0
  196. paddlex/configs/modules/semantic_segmentation/MaskFormer_tiny.yaml +42 -0
  197. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W18.yaml +40 -0
  198. paddlex/configs/modules/semantic_segmentation/OCRNet_HRNet-W48.yaml +40 -0
  199. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-B.yaml +41 -0
  200. paddlex/configs/modules/semantic_segmentation/PP-LiteSeg-T.yaml +40 -0
  201. paddlex/configs/modules/semantic_segmentation/SeaFormer_base.yaml +40 -0
  202. paddlex/configs/modules/semantic_segmentation/SeaFormer_large.yaml +40 -0
  203. paddlex/configs/modules/semantic_segmentation/SeaFormer_small.yaml +40 -0
  204. paddlex/configs/modules/semantic_segmentation/SeaFormer_tiny.yaml +40 -0
  205. paddlex/configs/modules/semantic_segmentation/SegFormer-B0.yaml +40 -0
  206. paddlex/configs/modules/semantic_segmentation/SegFormer-B1.yaml +40 -0
  207. paddlex/configs/modules/semantic_segmentation/SegFormer-B2.yaml +40 -0
  208. paddlex/configs/modules/semantic_segmentation/SegFormer-B3.yaml +40 -0
  209. paddlex/configs/modules/semantic_segmentation/SegFormer-B4.yaml +40 -0
  210. paddlex/configs/modules/semantic_segmentation/SegFormer-B5.yaml +40 -0
  211. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-L.yaml +42 -0
  212. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-S.yaml +42 -0
  213. paddlex/configs/modules/small_object_detection/PP-YOLOE_plus_SOD-largesize-L.yaml +42 -0
  214. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wired_table_cell_det.yaml +40 -0
  215. paddlex/configs/modules/table_cells_detection/RT-DETR-L_wireless_table_cell_det.yaml +40 -0
  216. paddlex/configs/modules/table_classification/PP-LCNet_x1_0_table_cls.yaml +41 -0
  217. paddlex/configs/modules/table_structure_recognition/SLANeXt_wired.yaml +39 -0
  218. paddlex/configs/modules/table_structure_recognition/SLANeXt_wireless.yaml +39 -0
  219. paddlex/configs/modules/table_structure_recognition/SLANet.yaml +39 -0
  220. paddlex/configs/modules/table_structure_recognition/SLANet_plus.yaml +39 -0
  221. paddlex/configs/modules/text_detection/PP-OCRv3_mobile_det.yaml +40 -0
  222. paddlex/configs/modules/text_detection/PP-OCRv3_server_det.yaml +40 -0
  223. paddlex/configs/modules/text_detection/PP-OCRv4_mobile_det.yaml +40 -0
  224. paddlex/configs/modules/text_detection/PP-OCRv4_server_det.yaml +40 -0
  225. paddlex/configs/modules/text_recognition/PP-OCRv3_mobile_rec.yaml +39 -0
  226. paddlex/configs/modules/text_recognition/PP-OCRv4_mobile_rec.yaml +39 -0
  227. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec.yaml +39 -0
  228. paddlex/configs/modules/text_recognition/PP-OCRv4_server_rec_doc.yaml +39 -0
  229. paddlex/configs/modules/text_recognition/arabic_PP-OCRv3_mobile_rec.yaml +39 -0
  230. paddlex/configs/modules/text_recognition/ch_RepSVTR_rec.yaml +39 -0
  231. paddlex/configs/modules/text_recognition/ch_SVTRv2_rec.yaml +39 -0
  232. paddlex/configs/modules/text_recognition/chinese_cht_PP-OCRv3_mobile_rec.yaml +39 -0
  233. paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv3_mobile_rec.yaml +39 -0
  234. paddlex/configs/modules/text_recognition/devanagari_PP-OCRv3_mobile_rec.yaml +39 -0
  235. paddlex/configs/modules/text_recognition/en_PP-OCRv3_mobile_rec.yaml +39 -0
  236. paddlex/configs/modules/text_recognition/en_PP-OCRv4_mobile_rec.yaml +39 -0
  237. paddlex/configs/modules/text_recognition/japan_PP-OCRv3_mobile_rec.yaml +39 -0
  238. paddlex/configs/modules/text_recognition/ka_PP-OCRv3_mobile_rec.yaml +39 -0
  239. paddlex/configs/modules/text_recognition/korean_PP-OCRv3_mobile_rec.yaml +39 -0
  240. paddlex/configs/modules/text_recognition/latin_PP-OCRv3_mobile_rec.yaml +39 -0
  241. paddlex/configs/modules/text_recognition/ta_PP-OCRv3_mobile_rec.yaml +39 -0
  242. paddlex/configs/modules/text_recognition/te_PP-OCRv3_mobile_rec.yaml +39 -0
  243. paddlex/configs/modules/textline_orientation/PP-LCNet_x0_25_textline_ori.yaml +41 -0
  244. paddlex/configs/modules/ts_anomaly_detection/AutoEncoder_ad.yaml +37 -0
  245. paddlex/configs/modules/ts_anomaly_detection/DLinear_ad.yaml +37 -0
  246. paddlex/configs/modules/ts_anomaly_detection/Nonstationary_ad.yaml +37 -0
  247. paddlex/configs/modules/ts_anomaly_detection/PatchTST_ad.yaml +37 -0
  248. paddlex/configs/modules/ts_anomaly_detection/TimesNet_ad.yaml +37 -0
  249. paddlex/configs/modules/ts_classification/TimesNet_cls.yaml +37 -0
  250. paddlex/configs/modules/ts_forecast/DLinear.yaml +38 -0
  251. paddlex/configs/modules/ts_forecast/NLinear.yaml +38 -0
  252. paddlex/configs/modules/ts_forecast/Nonstationary.yaml +38 -0
  253. paddlex/configs/modules/ts_forecast/PatchTST.yaml +38 -0
  254. paddlex/configs/modules/ts_forecast/RLinear.yaml +38 -0
  255. paddlex/configs/modules/ts_forecast/TiDE.yaml +38 -0
  256. paddlex/configs/modules/ts_forecast/TimesNet.yaml +38 -0
  257. paddlex/configs/modules/vehicle_attribute_recognition/PP-LCNet_x1_0_vehicle_attribute.yaml +41 -0
  258. paddlex/configs/modules/vehicle_detection/PP-YOLOE-L_vehicle.yaml +41 -0
  259. paddlex/configs/modules/vehicle_detection/PP-YOLOE-S_vehicle.yaml +42 -0
  260. paddlex/configs/modules/video_classification/PP-TSM-R50_8frames_uniform.yaml +42 -0
  261. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_16frames_uniform.yaml +42 -0
  262. paddlex/configs/modules/video_classification/PP-TSMv2-LCNetV2_8frames_uniform.yaml +42 -0
  263. paddlex/configs/modules/video_detection/YOWO.yaml +40 -0
  264. paddlex/configs/pipelines/3d_bev_detection.yaml +9 -0
  265. paddlex/configs/pipelines/OCR.yaml +44 -0
  266. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +149 -0
  267. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +184 -0
  268. paddlex/configs/pipelines/PP-ShiTuV2.yaml +18 -0
  269. paddlex/configs/pipelines/PP-StructureV3.yaml +226 -0
  270. paddlex/configs/pipelines/anomaly_detection.yaml +8 -0
  271. paddlex/configs/pipelines/doc_preprocessor.yaml +15 -0
  272. paddlex/configs/pipelines/face_recognition.yaml +18 -0
  273. paddlex/configs/pipelines/formula_recognition.yaml +39 -0
  274. paddlex/configs/pipelines/human_keypoint_detection.yaml +17 -0
  275. paddlex/configs/pipelines/image_classification.yaml +10 -0
  276. paddlex/configs/pipelines/image_multilabel_classification.yaml +9 -0
  277. paddlex/configs/pipelines/instance_segmentation.yaml +10 -0
  278. paddlex/configs/pipelines/layout_parsing.yaml +101 -0
  279. paddlex/configs/pipelines/multilingual_speech_recognition.yaml +9 -0
  280. paddlex/configs/pipelines/object_detection.yaml +10 -0
  281. paddlex/configs/pipelines/open_vocabulary_detection.yaml +12 -0
  282. paddlex/configs/pipelines/open_vocabulary_segmentation.yaml +13 -0
  283. paddlex/configs/pipelines/pedestrian_attribute_recognition.yaml +15 -0
  284. paddlex/configs/pipelines/rotated_object_detection.yaml +10 -0
  285. paddlex/configs/pipelines/seal_recognition.yaml +51 -0
  286. paddlex/configs/pipelines/semantic_segmentation.yaml +10 -0
  287. paddlex/configs/pipelines/small_object_detection.yaml +10 -0
  288. paddlex/configs/pipelines/table_recognition.yaml +56 -0
  289. paddlex/configs/pipelines/table_recognition_v2.yaml +76 -0
  290. paddlex/configs/pipelines/ts_anomaly_detection.yaml +8 -0
  291. paddlex/configs/pipelines/ts_classification.yaml +8 -0
  292. paddlex/configs/pipelines/ts_forecast.yaml +8 -0
  293. paddlex/configs/pipelines/vehicle_attribute_recognition.yaml +15 -0
  294. paddlex/configs/pipelines/video_classification.yaml +9 -0
  295. paddlex/configs/pipelines/video_detection.yaml +10 -0
  296. paddlex/engine.py +54 -0
  297. paddlex/hpip_links.html +19 -0
  298. paddlex/inference/__init__.py +19 -0
  299. paddlex/inference/common/__init__.py +13 -0
  300. paddlex/inference/common/batch_sampler/__init__.py +20 -0
  301. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +84 -0
  302. paddlex/inference/common/batch_sampler/base_batch_sampler.py +90 -0
  303. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +147 -0
  304. paddlex/inference/common/batch_sampler/image_batch_sampler.py +136 -0
  305. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +110 -0
  306. paddlex/inference/common/batch_sampler/video_batch_sampler.py +94 -0
  307. paddlex/inference/common/reader/__init__.py +19 -0
  308. paddlex/inference/common/reader/audio_reader.py +46 -0
  309. paddlex/inference/common/reader/det_3d_reader.py +239 -0
  310. paddlex/inference/common/reader/image_reader.py +69 -0
  311. paddlex/inference/common/reader/ts_reader.py +45 -0
  312. paddlex/inference/common/reader/video_reader.py +42 -0
  313. paddlex/inference/common/result/__init__.py +29 -0
  314. paddlex/inference/common/result/base_cv_result.py +31 -0
  315. paddlex/inference/common/result/base_result.py +70 -0
  316. paddlex/inference/common/result/base_ts_result.py +42 -0
  317. paddlex/inference/common/result/base_video_result.py +36 -0
  318. paddlex/inference/common/result/mixin.py +703 -0
  319. paddlex/inference/models/3d_bev_detection/__init__.py +15 -0
  320. paddlex/inference/models/3d_bev_detection/predictor.py +314 -0
  321. paddlex/inference/models/3d_bev_detection/processors.py +978 -0
  322. paddlex/inference/models/3d_bev_detection/result.py +65 -0
  323. paddlex/inference/models/3d_bev_detection/visualizer_3d.py +131 -0
  324. paddlex/inference/models/__init__.py +130 -0
  325. paddlex/inference/models/anomaly_detection/__init__.py +15 -0
  326. paddlex/inference/models/anomaly_detection/predictor.py +145 -0
  327. paddlex/inference/models/anomaly_detection/processors.py +46 -0
  328. paddlex/inference/models/anomaly_detection/result.py +70 -0
  329. paddlex/inference/models/base/__init__.py +15 -0
  330. paddlex/inference/models/base/predictor/__init__.py +16 -0
  331. paddlex/inference/models/base/predictor/base_predictor.py +175 -0
  332. paddlex/inference/models/base/predictor/basic_predictor.py +139 -0
  333. paddlex/inference/models/common/__init__.py +35 -0
  334. paddlex/inference/models/common/static_infer.py +329 -0
  335. paddlex/inference/models/common/tokenizer/__init__.py +17 -0
  336. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +655 -0
  337. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +451 -0
  338. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +2141 -0
  339. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3504 -0
  340. paddlex/inference/models/common/tokenizer/utils.py +66 -0
  341. paddlex/inference/models/common/tokenizer/vocab.py +647 -0
  342. paddlex/inference/models/common/ts/__init__.py +15 -0
  343. paddlex/inference/models/common/ts/funcs.py +533 -0
  344. paddlex/inference/models/common/ts/processors.py +313 -0
  345. paddlex/inference/models/common/vision/__init__.py +23 -0
  346. paddlex/inference/models/common/vision/funcs.py +93 -0
  347. paddlex/inference/models/common/vision/processors.py +270 -0
  348. paddlex/inference/models/face_feature/__init__.py +15 -0
  349. paddlex/inference/models/face_feature/predictor.py +65 -0
  350. paddlex/inference/models/formula_recognition/__init__.py +15 -0
  351. paddlex/inference/models/formula_recognition/predictor.py +203 -0
  352. paddlex/inference/models/formula_recognition/processors.py +986 -0
  353. paddlex/inference/models/formula_recognition/result.py +403 -0
  354. paddlex/inference/models/image_classification/__init__.py +15 -0
  355. paddlex/inference/models/image_classification/predictor.py +182 -0
  356. paddlex/inference/models/image_classification/processors.py +87 -0
  357. paddlex/inference/models/image_classification/result.py +92 -0
  358. paddlex/inference/models/image_feature/__init__.py +15 -0
  359. paddlex/inference/models/image_feature/predictor.py +156 -0
  360. paddlex/inference/models/image_feature/processors.py +29 -0
  361. paddlex/inference/models/image_feature/result.py +33 -0
  362. paddlex/inference/models/image_multilabel_classification/__init__.py +15 -0
  363. paddlex/inference/models/image_multilabel_classification/predictor.py +94 -0
  364. paddlex/inference/models/image_multilabel_classification/processors.py +85 -0
  365. paddlex/inference/models/image_multilabel_classification/result.py +95 -0
  366. paddlex/inference/models/image_unwarping/__init__.py +15 -0
  367. paddlex/inference/models/image_unwarping/predictor.py +105 -0
  368. paddlex/inference/models/image_unwarping/processors.py +88 -0
  369. paddlex/inference/models/image_unwarping/result.py +45 -0
  370. paddlex/inference/models/instance_segmentation/__init__.py +15 -0
  371. paddlex/inference/models/instance_segmentation/predictor.py +210 -0
  372. paddlex/inference/models/instance_segmentation/processors.py +105 -0
  373. paddlex/inference/models/instance_segmentation/result.py +161 -0
  374. paddlex/inference/models/keypoint_detection/__init__.py +15 -0
  375. paddlex/inference/models/keypoint_detection/predictor.py +188 -0
  376. paddlex/inference/models/keypoint_detection/processors.py +359 -0
  377. paddlex/inference/models/keypoint_detection/result.py +192 -0
  378. paddlex/inference/models/multilingual_speech_recognition/__init__.py +15 -0
  379. paddlex/inference/models/multilingual_speech_recognition/predictor.py +141 -0
  380. paddlex/inference/models/multilingual_speech_recognition/processors.py +1941 -0
  381. paddlex/inference/models/multilingual_speech_recognition/result.py +21 -0
  382. paddlex/inference/models/object_detection/__init__.py +15 -0
  383. paddlex/inference/models/object_detection/predictor.py +348 -0
  384. paddlex/inference/models/object_detection/processors.py +855 -0
  385. paddlex/inference/models/object_detection/result.py +113 -0
  386. paddlex/inference/models/object_detection/utils.py +68 -0
  387. paddlex/inference/models/open_vocabulary_detection/__init__.py +15 -0
  388. paddlex/inference/models/open_vocabulary_detection/predictor.py +155 -0
  389. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +15 -0
  390. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +485 -0
  391. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +15 -0
  392. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +120 -0
  393. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +15 -0
  394. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +249 -0
  395. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +15 -0
  396. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +147 -0
  397. paddlex/inference/models/semantic_segmentation/__init__.py +15 -0
  398. paddlex/inference/models/semantic_segmentation/predictor.py +167 -0
  399. paddlex/inference/models/semantic_segmentation/processors.py +114 -0
  400. paddlex/inference/models/semantic_segmentation/result.py +72 -0
  401. paddlex/inference/models/table_structure_recognition/__init__.py +15 -0
  402. paddlex/inference/models/table_structure_recognition/predictor.py +171 -0
  403. paddlex/inference/models/table_structure_recognition/processors.py +235 -0
  404. paddlex/inference/models/table_structure_recognition/result.py +70 -0
  405. paddlex/inference/models/text_detection/__init__.py +15 -0
  406. paddlex/inference/models/text_detection/predictor.py +191 -0
  407. paddlex/inference/models/text_detection/processors.py +466 -0
  408. paddlex/inference/models/text_detection/result.py +51 -0
  409. paddlex/inference/models/text_recognition/__init__.py +15 -0
  410. paddlex/inference/models/text_recognition/predictor.py +106 -0
  411. paddlex/inference/models/text_recognition/processors.py +231 -0
  412. paddlex/inference/models/text_recognition/result.py +75 -0
  413. paddlex/inference/models/ts_anomaly_detection/__init__.py +15 -0
  414. paddlex/inference/models/ts_anomaly_detection/predictor.py +146 -0
  415. paddlex/inference/models/ts_anomaly_detection/processors.py +94 -0
  416. paddlex/inference/models/ts_anomaly_detection/result.py +72 -0
  417. paddlex/inference/models/ts_classification/__init__.py +15 -0
  418. paddlex/inference/models/ts_classification/predictor.py +135 -0
  419. paddlex/inference/models/ts_classification/processors.py +117 -0
  420. paddlex/inference/models/ts_classification/result.py +78 -0
  421. paddlex/inference/models/ts_forecasting/__init__.py +15 -0
  422. paddlex/inference/models/ts_forecasting/predictor.py +159 -0
  423. paddlex/inference/models/ts_forecasting/processors.py +149 -0
  424. paddlex/inference/models/ts_forecasting/result.py +83 -0
  425. paddlex/inference/models/video_classification/__init__.py +15 -0
  426. paddlex/inference/models/video_classification/predictor.py +147 -0
  427. paddlex/inference/models/video_classification/processors.py +409 -0
  428. paddlex/inference/models/video_classification/result.py +92 -0
  429. paddlex/inference/models/video_detection/__init__.py +15 -0
  430. paddlex/inference/models/video_detection/predictor.py +136 -0
  431. paddlex/inference/models/video_detection/processors.py +450 -0
  432. paddlex/inference/models/video_detection/result.py +104 -0
  433. paddlex/inference/pipelines/3d_bev_detection/__init__.py +15 -0
  434. paddlex/inference/pipelines/3d_bev_detection/pipeline.py +67 -0
  435. paddlex/inference/pipelines/__init__.py +228 -0
  436. paddlex/inference/pipelines/anomaly_detection/__init__.py +15 -0
  437. paddlex/inference/pipelines/anomaly_detection/pipeline.py +62 -0
  438. paddlex/inference/pipelines/attribute_recognition/__init__.py +15 -0
  439. paddlex/inference/pipelines/attribute_recognition/pipeline.py +105 -0
  440. paddlex/inference/pipelines/attribute_recognition/result.py +100 -0
  441. paddlex/inference/pipelines/base.py +132 -0
  442. paddlex/inference/pipelines/components/__init__.py +23 -0
  443. paddlex/inference/pipelines/components/chat_server/__init__.py +16 -0
  444. paddlex/inference/pipelines/components/chat_server/base.py +39 -0
  445. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +236 -0
  446. paddlex/inference/pipelines/components/common/__init__.py +18 -0
  447. paddlex/inference/pipelines/components/common/base_operator.py +36 -0
  448. paddlex/inference/pipelines/components/common/base_result.py +65 -0
  449. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +46 -0
  450. paddlex/inference/pipelines/components/common/crop_image_regions.py +550 -0
  451. paddlex/inference/pipelines/components/common/seal_det_warp.py +941 -0
  452. paddlex/inference/pipelines/components/common/sort_boxes.py +83 -0
  453. paddlex/inference/pipelines/components/faisser.py +352 -0
  454. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +16 -0
  455. paddlex/inference/pipelines/components/prompt_engineering/base.py +35 -0
  456. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +127 -0
  457. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +148 -0
  458. paddlex/inference/pipelines/components/retriever/__init__.py +16 -0
  459. paddlex/inference/pipelines/components/retriever/base.py +226 -0
  460. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +70 -0
  461. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +163 -0
  462. paddlex/inference/pipelines/components/utils/__init__.py +13 -0
  463. paddlex/inference/pipelines/components/utils/mixin.py +206 -0
  464. paddlex/inference/pipelines/doc_preprocessor/__init__.py +15 -0
  465. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +190 -0
  466. paddlex/inference/pipelines/doc_preprocessor/result.py +103 -0
  467. paddlex/inference/pipelines/face_recognition/__init__.py +15 -0
  468. paddlex/inference/pipelines/face_recognition/pipeline.py +61 -0
  469. paddlex/inference/pipelines/face_recognition/result.py +43 -0
  470. paddlex/inference/pipelines/formula_recognition/__init__.py +15 -0
  471. paddlex/inference/pipelines/formula_recognition/pipeline.py +303 -0
  472. paddlex/inference/pipelines/formula_recognition/result.py +291 -0
  473. paddlex/inference/pipelines/image_classification/__init__.py +15 -0
  474. paddlex/inference/pipelines/image_classification/pipeline.py +71 -0
  475. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +15 -0
  476. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +78 -0
  477. paddlex/inference/pipelines/instance_segmentation/__init__.py +15 -0
  478. paddlex/inference/pipelines/instance_segmentation/pipeline.py +70 -0
  479. paddlex/inference/pipelines/keypoint_detection/__init__.py +15 -0
  480. paddlex/inference/pipelines/keypoint_detection/pipeline.py +137 -0
  481. paddlex/inference/pipelines/layout_parsing/__init__.py +16 -0
  482. paddlex/inference/pipelines/layout_parsing/pipeline.py +570 -0
  483. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +739 -0
  484. paddlex/inference/pipelines/layout_parsing/result.py +203 -0
  485. paddlex/inference/pipelines/layout_parsing/result_v2.py +470 -0
  486. paddlex/inference/pipelines/layout_parsing/utils.py +2385 -0
  487. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +15 -0
  488. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +67 -0
  489. paddlex/inference/pipelines/object_detection/__init__.py +15 -0
  490. paddlex/inference/pipelines/object_detection/pipeline.py +95 -0
  491. paddlex/inference/pipelines/ocr/__init__.py +15 -0
  492. paddlex/inference/pipelines/ocr/pipeline.py +389 -0
  493. paddlex/inference/pipelines/ocr/result.py +248 -0
  494. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +15 -0
  495. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +75 -0
  496. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +15 -0
  497. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +89 -0
  498. paddlex/inference/pipelines/pp_chatocr/__init__.py +16 -0
  499. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +102 -0
  500. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +773 -0
  501. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +977 -0
  502. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +15 -0
  503. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +152 -0
  504. paddlex/inference/pipelines/pp_shitu_v2/result.py +126 -0
  505. paddlex/inference/pipelines/rotated_object_detection/__init__.py +15 -0
  506. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +74 -0
  507. paddlex/inference/pipelines/seal_recognition/__init__.py +15 -0
  508. paddlex/inference/pipelines/seal_recognition/pipeline.py +271 -0
  509. paddlex/inference/pipelines/seal_recognition/result.py +87 -0
  510. paddlex/inference/pipelines/semantic_segmentation/__init__.py +15 -0
  511. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +74 -0
  512. paddlex/inference/pipelines/small_object_detection/__init__.py +15 -0
  513. paddlex/inference/pipelines/small_object_detection/pipeline.py +74 -0
  514. paddlex/inference/pipelines/table_recognition/__init__.py +16 -0
  515. paddlex/inference/pipelines/table_recognition/pipeline.py +462 -0
  516. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +792 -0
  517. paddlex/inference/pipelines/table_recognition/result.py +216 -0
  518. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +362 -0
  519. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +470 -0
  520. paddlex/inference/pipelines/table_recognition/utils.py +44 -0
  521. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +15 -0
  522. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +62 -0
  523. paddlex/inference/pipelines/ts_classification/__init__.py +15 -0
  524. paddlex/inference/pipelines/ts_classification/pipeline.py +62 -0
  525. paddlex/inference/pipelines/ts_forecasting/__init__.py +15 -0
  526. paddlex/inference/pipelines/ts_forecasting/pipeline.py +62 -0
  527. paddlex/inference/pipelines/video_classification/__init__.py +15 -0
  528. paddlex/inference/pipelines/video_classification/pipeline.py +68 -0
  529. paddlex/inference/pipelines/video_detection/__init__.py +15 -0
  530. paddlex/inference/pipelines/video_detection/pipeline.py +73 -0
  531. paddlex/inference/serving/__init__.py +13 -0
  532. paddlex/inference/serving/basic_serving/__init__.py +18 -0
  533. paddlex/inference/serving/basic_serving/_app.py +209 -0
  534. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +41 -0
  535. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +13 -0
  536. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +96 -0
  537. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +36 -0
  538. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +90 -0
  539. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +64 -0
  540. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +97 -0
  541. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +223 -0
  542. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +97 -0
  543. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +78 -0
  544. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +66 -0
  545. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +70 -0
  546. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +81 -0
  547. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +115 -0
  548. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +76 -0
  549. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +89 -0
  550. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +74 -0
  551. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +99 -0
  552. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +78 -0
  553. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +85 -0
  554. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +81 -0
  555. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +191 -0
  556. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +221 -0
  557. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +218 -0
  558. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +136 -0
  559. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +78 -0
  560. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +103 -0
  561. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +64 -0
  562. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +69 -0
  563. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +105 -0
  564. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +107 -0
  565. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +62 -0
  566. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +61 -0
  567. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +62 -0
  568. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +81 -0
  569. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +73 -0
  570. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +89 -0
  571. paddlex/inference/serving/basic_serving/_server.py +35 -0
  572. paddlex/inference/serving/infra/__init__.py +13 -0
  573. paddlex/inference/serving/infra/config.py +36 -0
  574. paddlex/inference/serving/infra/models.py +72 -0
  575. paddlex/inference/serving/infra/storage.py +175 -0
  576. paddlex/inference/serving/infra/utils.py +259 -0
  577. paddlex/inference/serving/schemas/__init__.py +13 -0
  578. paddlex/inference/serving/schemas/anomaly_detection.py +39 -0
  579. paddlex/inference/serving/schemas/doc_preprocessor.py +54 -0
  580. paddlex/inference/serving/schemas/face_recognition.py +124 -0
  581. paddlex/inference/serving/schemas/formula_recognition.py +56 -0
  582. paddlex/inference/serving/schemas/human_keypoint_detection.py +55 -0
  583. paddlex/inference/serving/schemas/image_classification.py +45 -0
  584. paddlex/inference/serving/schemas/image_multilabel_classification.py +47 -0
  585. paddlex/inference/serving/schemas/instance_segmentation.py +53 -0
  586. paddlex/inference/serving/schemas/layout_parsing.py +72 -0
  587. paddlex/inference/serving/schemas/m_3d_bev_detection.py +48 -0
  588. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +57 -0
  589. paddlex/inference/serving/schemas/object_detection.py +52 -0
  590. paddlex/inference/serving/schemas/ocr.py +60 -0
  591. paddlex/inference/serving/schemas/open_vocabulary_detection.py +52 -0
  592. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +52 -0
  593. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +61 -0
  594. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +134 -0
  595. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +151 -0
  596. paddlex/inference/serving/schemas/pp_shituv2.py +124 -0
  597. paddlex/inference/serving/schemas/pp_structurev3.py +84 -0
  598. paddlex/inference/serving/schemas/rotated_object_detection.py +52 -0
  599. paddlex/inference/serving/schemas/seal_recognition.py +62 -0
  600. paddlex/inference/serving/schemas/semantic_segmentation.py +45 -0
  601. paddlex/inference/serving/schemas/shared/__init__.py +13 -0
  602. paddlex/inference/serving/schemas/shared/classification.py +23 -0
  603. paddlex/inference/serving/schemas/shared/image_segmentation.py +28 -0
  604. paddlex/inference/serving/schemas/shared/object_detection.py +24 -0
  605. paddlex/inference/serving/schemas/shared/ocr.py +25 -0
  606. paddlex/inference/serving/schemas/small_object_detection.py +52 -0
  607. paddlex/inference/serving/schemas/table_recognition.py +64 -0
  608. paddlex/inference/serving/schemas/table_recognition_v2.py +66 -0
  609. paddlex/inference/serving/schemas/ts_anomaly_detection.py +37 -0
  610. paddlex/inference/serving/schemas/ts_classification.py +38 -0
  611. paddlex/inference/serving/schemas/ts_forecast.py +37 -0
  612. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +61 -0
  613. paddlex/inference/serving/schemas/video_classification.py +44 -0
  614. paddlex/inference/serving/schemas/video_detection.py +56 -0
  615. paddlex/inference/utils/__init__.py +13 -0
  616. paddlex/inference/utils/benchmark.py +226 -0
  617. paddlex/inference/utils/color_map.py +123 -0
  618. paddlex/inference/utils/get_pipeline_path.py +27 -0
  619. paddlex/inference/utils/io/__init__.py +36 -0
  620. paddlex/inference/utils/io/readers.py +500 -0
  621. paddlex/inference/utils/io/style.py +374 -0
  622. paddlex/inference/utils/io/tablepyxl.py +149 -0
  623. paddlex/inference/utils/io/writers.py +459 -0
  624. paddlex/inference/utils/new_ir_blacklist.py +28 -0
  625. paddlex/inference/utils/official_models.py +352 -0
  626. paddlex/inference/utils/pp_option.py +256 -0
  627. paddlex/model.py +113 -0
  628. paddlex/modules/3d_bev_detection/__init__.py +18 -0
  629. paddlex/modules/3d_bev_detection/dataset_checker/__init__.py +95 -0
  630. paddlex/modules/3d_bev_detection/dataset_checker/dataset_src/__init__.py +17 -0
  631. paddlex/modules/3d_bev_detection/dataset_checker/dataset_src/analyse_dataset.py +106 -0
  632. paddlex/modules/3d_bev_detection/dataset_checker/dataset_src/check_dataset.py +102 -0
  633. paddlex/modules/3d_bev_detection/evaluator.py +46 -0
  634. paddlex/modules/3d_bev_detection/exportor.py +22 -0
  635. paddlex/modules/3d_bev_detection/model_list.py +18 -0
  636. paddlex/modules/3d_bev_detection/trainer.py +70 -0
  637. paddlex/modules/__init__.py +138 -0
  638. paddlex/modules/anomaly_detection/__init__.py +18 -0
  639. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +95 -0
  640. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  641. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +79 -0
  642. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +87 -0
  643. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +230 -0
  644. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +87 -0
  645. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  646. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +71 -0
  647. paddlex/modules/anomaly_detection/evaluator.py +58 -0
  648. paddlex/modules/anomaly_detection/exportor.py +22 -0
  649. paddlex/modules/anomaly_detection/model_list.py +16 -0
  650. paddlex/modules/anomaly_detection/trainer.py +71 -0
  651. paddlex/modules/base/__init__.py +18 -0
  652. paddlex/modules/base/build_model.py +34 -0
  653. paddlex/modules/base/dataset_checker/__init__.py +16 -0
  654. paddlex/modules/base/dataset_checker/dataset_checker.py +169 -0
  655. paddlex/modules/base/dataset_checker/utils.py +110 -0
  656. paddlex/modules/base/evaluator.py +170 -0
  657. paddlex/modules/base/exportor.py +146 -0
  658. paddlex/modules/base/trainer.py +134 -0
  659. paddlex/modules/face_recognition/__init__.py +18 -0
  660. paddlex/modules/face_recognition/dataset_checker/__init__.py +71 -0
  661. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +16 -0
  662. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +174 -0
  663. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  664. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +156 -0
  665. paddlex/modules/face_recognition/evaluator.py +52 -0
  666. paddlex/modules/face_recognition/exportor.py +22 -0
  667. paddlex/modules/face_recognition/model_list.py +15 -0
  668. paddlex/modules/face_recognition/trainer.py +75 -0
  669. paddlex/modules/formula_recognition/__init__.py +18 -0
  670. paddlex/modules/formula_recognition/dataset_checker/__init__.py +113 -0
  671. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  672. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +157 -0
  673. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +80 -0
  674. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +94 -0
  675. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +81 -0
  676. paddlex/modules/formula_recognition/evaluator.py +77 -0
  677. paddlex/modules/formula_recognition/exportor.py +22 -0
  678. paddlex/modules/formula_recognition/model_list.py +20 -0
  679. paddlex/modules/formula_recognition/trainer.py +121 -0
  680. paddlex/modules/general_recognition/__init__.py +18 -0
  681. paddlex/modules/general_recognition/dataset_checker/__init__.py +107 -0
  682. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  683. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +98 -0
  684. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +100 -0
  685. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +99 -0
  686. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +82 -0
  687. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +13 -0
  688. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +150 -0
  689. paddlex/modules/general_recognition/evaluator.py +31 -0
  690. paddlex/modules/general_recognition/exportor.py +22 -0
  691. paddlex/modules/general_recognition/model_list.py +19 -0
  692. paddlex/modules/general_recognition/trainer.py +52 -0
  693. paddlex/modules/image_classification/__init__.py +18 -0
  694. paddlex/modules/image_classification/dataset_checker/__init__.py +104 -0
  695. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +19 -0
  696. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +93 -0
  697. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +131 -0
  698. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +51 -0
  699. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  700. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  701. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +156 -0
  702. paddlex/modules/image_classification/evaluator.py +43 -0
  703. paddlex/modules/image_classification/exportor.py +22 -0
  704. paddlex/modules/image_classification/model_list.py +99 -0
  705. paddlex/modules/image_classification/trainer.py +82 -0
  706. paddlex/modules/image_unwarping/__init__.py +13 -0
  707. paddlex/modules/image_unwarping/model_list.py +17 -0
  708. paddlex/modules/instance_segmentation/__init__.py +18 -0
  709. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +108 -0
  710. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  711. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +78 -0
  712. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +92 -0
  713. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +241 -0
  714. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +119 -0
  715. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  716. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +221 -0
  717. paddlex/modules/instance_segmentation/evaluator.py +32 -0
  718. paddlex/modules/instance_segmentation/exportor.py +22 -0
  719. paddlex/modules/instance_segmentation/model_list.py +33 -0
  720. paddlex/modules/instance_segmentation/trainer.py +31 -0
  721. paddlex/modules/keypoint_detection/__init__.py +18 -0
  722. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +56 -0
  723. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +15 -0
  724. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +86 -0
  725. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  726. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +119 -0
  727. paddlex/modules/keypoint_detection/evaluator.py +41 -0
  728. paddlex/modules/keypoint_detection/exportor.py +22 -0
  729. paddlex/modules/keypoint_detection/model_list.py +16 -0
  730. paddlex/modules/keypoint_detection/trainer.py +39 -0
  731. paddlex/modules/multilabel_classification/__init__.py +18 -0
  732. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +106 -0
  733. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +19 -0
  734. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +95 -0
  735. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +131 -0
  736. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +117 -0
  737. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +81 -0
  738. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +13 -0
  739. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +153 -0
  740. paddlex/modules/multilabel_classification/evaluator.py +43 -0
  741. paddlex/modules/multilabel_classification/exportor.py +22 -0
  742. paddlex/modules/multilabel_classification/model_list.py +24 -0
  743. paddlex/modules/multilabel_classification/trainer.py +85 -0
  744. paddlex/modules/multilingual_speech_recognition/__init__.py +18 -0
  745. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +27 -0
  746. paddlex/modules/multilingual_speech_recognition/evaluator.py +27 -0
  747. paddlex/modules/multilingual_speech_recognition/exportor.py +27 -0
  748. paddlex/modules/multilingual_speech_recognition/model_list.py +22 -0
  749. paddlex/modules/multilingual_speech_recognition/trainer.py +40 -0
  750. paddlex/modules/object_detection/__init__.py +18 -0
  751. paddlex/modules/object_detection/dataset_checker/__init__.py +115 -0
  752. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +19 -0
  753. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +80 -0
  754. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +86 -0
  755. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +433 -0
  756. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +119 -0
  757. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +13 -0
  758. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +192 -0
  759. paddlex/modules/object_detection/evaluator.py +52 -0
  760. paddlex/modules/object_detection/exportor.py +22 -0
  761. paddlex/modules/object_detection/model_list.py +84 -0
  762. paddlex/modules/object_detection/trainer.py +99 -0
  763. paddlex/modules/open_vocabulary_detection/__init__.py +18 -0
  764. paddlex/modules/open_vocabulary_detection/dataset_checker.py +29 -0
  765. paddlex/modules/open_vocabulary_detection/evaluator.py +29 -0
  766. paddlex/modules/open_vocabulary_detection/exportor.py +29 -0
  767. paddlex/modules/open_vocabulary_detection/model_list.py +18 -0
  768. paddlex/modules/open_vocabulary_detection/trainer.py +42 -0
  769. paddlex/modules/open_vocabulary_segmentation/__init__.py +18 -0
  770. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +29 -0
  771. paddlex/modules/open_vocabulary_segmentation/evaluator.py +29 -0
  772. paddlex/modules/open_vocabulary_segmentation/exportor.py +29 -0
  773. paddlex/modules/open_vocabulary_segmentation/model_list.py +19 -0
  774. paddlex/modules/open_vocabulary_segmentation/trainer.py +42 -0
  775. paddlex/modules/semantic_segmentation/__init__.py +18 -0
  776. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +110 -0
  777. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +19 -0
  778. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +73 -0
  779. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +80 -0
  780. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +162 -0
  781. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +87 -0
  782. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +13 -0
  783. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +71 -0
  784. paddlex/modules/semantic_segmentation/evaluator.py +58 -0
  785. paddlex/modules/semantic_segmentation/exportor.py +31 -0
  786. paddlex/modules/semantic_segmentation/model_list.py +37 -0
  787. paddlex/modules/semantic_segmentation/trainer.py +73 -0
  788. paddlex/modules/table_recognition/__init__.py +18 -0
  789. paddlex/modules/table_recognition/dataset_checker/__init__.py +98 -0
  790. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +18 -0
  791. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +58 -0
  792. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +86 -0
  793. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +79 -0
  794. paddlex/modules/table_recognition/evaluator.py +43 -0
  795. paddlex/modules/table_recognition/exportor.py +22 -0
  796. paddlex/modules/table_recognition/model_list.py +21 -0
  797. paddlex/modules/table_recognition/trainer.py +70 -0
  798. paddlex/modules/text_detection/__init__.py +18 -0
  799. paddlex/modules/text_detection/dataset_checker/__init__.py +109 -0
  800. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +18 -0
  801. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +217 -0
  802. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +106 -0
  803. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +140 -0
  804. paddlex/modules/text_detection/evaluator.py +41 -0
  805. paddlex/modules/text_detection/exportor.py +22 -0
  806. paddlex/modules/text_detection/model_list.py +24 -0
  807. paddlex/modules/text_detection/trainer.py +68 -0
  808. paddlex/modules/text_recognition/__init__.py +18 -0
  809. paddlex/modules/text_recognition/dataset_checker/__init__.py +126 -0
  810. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +19 -0
  811. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +161 -0
  812. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +107 -0
  813. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +94 -0
  814. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +81 -0
  815. paddlex/modules/text_recognition/evaluator.py +64 -0
  816. paddlex/modules/text_recognition/exportor.py +22 -0
  817. paddlex/modules/text_recognition/model_list.py +34 -0
  818. paddlex/modules/text_recognition/trainer.py +106 -0
  819. paddlex/modules/ts_anomaly_detection/__init__.py +19 -0
  820. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +112 -0
  821. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +19 -0
  822. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +27 -0
  823. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +64 -0
  824. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +78 -0
  825. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +63 -0
  826. paddlex/modules/ts_anomaly_detection/evaluator.py +67 -0
  827. paddlex/modules/ts_anomaly_detection/exportor.py +45 -0
  828. paddlex/modules/ts_anomaly_detection/model_list.py +22 -0
  829. paddlex/modules/ts_anomaly_detection/trainer.py +113 -0
  830. paddlex/modules/ts_classification/__init__.py +19 -0
  831. paddlex/modules/ts_classification/dataset_checker/__init__.py +112 -0
  832. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +19 -0
  833. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +74 -0
  834. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +64 -0
  835. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +78 -0
  836. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +88 -0
  837. paddlex/modules/ts_classification/evaluator.py +66 -0
  838. paddlex/modules/ts_classification/exportor.py +45 -0
  839. paddlex/modules/ts_classification/model_list.py +18 -0
  840. paddlex/modules/ts_classification/trainer.py +108 -0
  841. paddlex/modules/ts_forecast/__init__.py +19 -0
  842. paddlex/modules/ts_forecast/dataset_checker/__init__.py +112 -0
  843. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +19 -0
  844. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +27 -0
  845. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +64 -0
  846. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +77 -0
  847. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +63 -0
  848. paddlex/modules/ts_forecast/evaluator.py +66 -0
  849. paddlex/modules/ts_forecast/exportor.py +45 -0
  850. paddlex/modules/ts_forecast/model_list.py +24 -0
  851. paddlex/modules/ts_forecast/trainer.py +108 -0
  852. paddlex/modules/video_classification/__init__.py +18 -0
  853. paddlex/modules/video_classification/dataset_checker/__init__.py +93 -0
  854. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +18 -0
  855. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +93 -0
  856. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +121 -0
  857. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +82 -0
  858. paddlex/modules/video_classification/evaluator.py +44 -0
  859. paddlex/modules/video_classification/exportor.py +22 -0
  860. paddlex/modules/video_classification/model_list.py +19 -0
  861. paddlex/modules/video_classification/trainer.py +88 -0
  862. paddlex/modules/video_detection/__init__.py +18 -0
  863. paddlex/modules/video_detection/dataset_checker/__init__.py +86 -0
  864. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +17 -0
  865. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +101 -0
  866. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +134 -0
  867. paddlex/modules/video_detection/evaluator.py +42 -0
  868. paddlex/modules/video_detection/exportor.py +22 -0
  869. paddlex/modules/video_detection/model_list.py +15 -0
  870. paddlex/modules/video_detection/trainer.py +82 -0
  871. paddlex/ops/__init__.py +149 -0
  872. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +264 -0
  873. paddlex/ops/iou3d_nms/iou3d_cpu.h +27 -0
  874. paddlex/ops/iou3d_nms/iou3d_nms.cpp +204 -0
  875. paddlex/ops/iou3d_nms/iou3d_nms.h +33 -0
  876. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +108 -0
  877. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +482 -0
  878. paddlex/ops/setup.py +37 -0
  879. paddlex/ops/voxel/voxelize_op.cc +191 -0
  880. paddlex/ops/voxel/voxelize_op.cu +346 -0
  881. paddlex/paddle2onnx_requirements.txt +1 -0
  882. paddlex/paddlex_cli.py +464 -0
  883. paddlex/repo_apis/Paddle3D_api/__init__.py +17 -0
  884. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +18 -0
  885. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +118 -0
  886. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +238 -0
  887. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +55 -0
  888. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +104 -0
  889. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +144 -0
  890. paddlex/repo_apis/PaddleClas_api/__init__.py +17 -0
  891. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +19 -0
  892. paddlex/repo_apis/PaddleClas_api/cls/config.py +594 -0
  893. paddlex/repo_apis/PaddleClas_api/cls/model.py +355 -0
  894. paddlex/repo_apis/PaddleClas_api/cls/register.py +908 -0
  895. paddlex/repo_apis/PaddleClas_api/cls/runner.py +219 -0
  896. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +18 -0
  897. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +141 -0
  898. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +23 -0
  899. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +68 -0
  900. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +55 -0
  901. paddlex/repo_apis/PaddleDetection_api/__init__.py +17 -0
  902. paddlex/repo_apis/PaddleDetection_api/config_helper.py +280 -0
  903. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +18 -0
  904. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +458 -0
  905. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +403 -0
  906. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +263 -0
  907. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +226 -0
  908. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +19 -0
  909. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +539 -0
  910. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +430 -0
  911. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +220 -0
  912. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +1106 -0
  913. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +226 -0
  914. paddlex/repo_apis/PaddleNLP_api/__init__.py +13 -0
  915. paddlex/repo_apis/PaddleOCR_api/__init__.py +21 -0
  916. paddlex/repo_apis/PaddleOCR_api/config_utils.py +53 -0
  917. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +16 -0
  918. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +570 -0
  919. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +402 -0
  920. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +73 -0
  921. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +240 -0
  922. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +16 -0
  923. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +64 -0
  924. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +126 -0
  925. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +71 -0
  926. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +51 -0
  927. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +16 -0
  928. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +62 -0
  929. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +72 -0
  930. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +90 -0
  931. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +53 -0
  932. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +16 -0
  933. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +563 -0
  934. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +402 -0
  935. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +199 -0
  936. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +240 -0
  937. paddlex/repo_apis/PaddleSeg_api/__init__.py +16 -0
  938. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +134 -0
  939. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +16 -0
  940. paddlex/repo_apis/PaddleSeg_api/seg/config.py +186 -0
  941. paddlex/repo_apis/PaddleSeg_api/seg/model.py +491 -0
  942. paddlex/repo_apis/PaddleSeg_api/seg/register.py +273 -0
  943. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +262 -0
  944. paddlex/repo_apis/PaddleTS_api/__init__.py +19 -0
  945. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +16 -0
  946. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +89 -0
  947. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +146 -0
  948. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +158 -0
  949. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +13 -0
  950. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +246 -0
  951. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +276 -0
  952. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +158 -0
  953. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +16 -0
  954. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +73 -0
  955. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +59 -0
  956. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +158 -0
  957. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +16 -0
  958. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +137 -0
  959. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +186 -0
  960. paddlex/repo_apis/PaddleVideo_api/__init__.py +17 -0
  961. paddlex/repo_apis/PaddleVideo_api/config_utils.py +51 -0
  962. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +19 -0
  963. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +547 -0
  964. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +346 -0
  965. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +71 -0
  966. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +205 -0
  967. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +19 -0
  968. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +548 -0
  969. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +298 -0
  970. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +45 -0
  971. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +200 -0
  972. paddlex/repo_apis/__init__.py +13 -0
  973. paddlex/repo_apis/base/__init__.py +23 -0
  974. paddlex/repo_apis/base/config.py +238 -0
  975. paddlex/repo_apis/base/model.py +571 -0
  976. paddlex/repo_apis/base/register.py +135 -0
  977. paddlex/repo_apis/base/runner.py +391 -0
  978. paddlex/repo_apis/base/utils/__init__.py +13 -0
  979. paddlex/repo_apis/base/utils/arg.py +64 -0
  980. paddlex/repo_apis/base/utils/subprocess.py +107 -0
  981. paddlex/repo_manager/__init__.py +24 -0
  982. paddlex/repo_manager/core.py +271 -0
  983. paddlex/repo_manager/meta.py +170 -0
  984. paddlex/repo_manager/repo.py +415 -0
  985. paddlex/repo_manager/requirements.txt +21 -0
  986. paddlex/repo_manager/utils.py +359 -0
  987. paddlex/serving_requirements.txt +9 -0
  988. paddlex/utils/__init__.py +1 -12
  989. paddlex/utils/cache.py +148 -0
  990. paddlex/utils/config.py +215 -0
  991. paddlex/utils/custom_device_whitelist.py +457 -0
  992. paddlex/utils/device.py +151 -0
  993. paddlex/utils/download.py +168 -182
  994. paddlex/utils/env.py +11 -50
  995. paddlex/utils/errors/__init__.py +17 -0
  996. paddlex/utils/errors/dataset_checker.py +78 -0
  997. paddlex/utils/errors/others.py +152 -0
  998. paddlex/utils/file_interface.py +212 -0
  999. paddlex/utils/flags.py +65 -0
  1000. paddlex/utils/fonts/__init__.py +67 -0
  1001. paddlex/utils/func_register.py +41 -0
  1002. paddlex/utils/interactive_get_pipeline.py +55 -0
  1003. paddlex/utils/lazy_loader.py +68 -0
  1004. paddlex/utils/logging.py +131 -33
  1005. paddlex/utils/misc.py +201 -0
  1006. paddlex/utils/pipeline_arguments.py +711 -0
  1007. paddlex/utils/result_saver.py +59 -0
  1008. paddlex/utils/subclass_register.py +101 -0
  1009. paddlex/version.py +54 -0
  1010. paddlex-3.0.0rc0.dist-info/LICENSE +169 -0
  1011. paddlex-3.0.0rc0.dist-info/METADATA +1035 -0
  1012. paddlex-3.0.0rc0.dist-info/RECORD +1015 -0
  1013. paddlex-3.0.0rc0.dist-info/WHEEL +5 -0
  1014. paddlex-3.0.0rc0.dist-info/entry_points.txt +2 -0
  1015. paddlex-3.0.0rc0.dist-info/top_level.txt +1 -0
  1016. PaddleClas/__init__.py +0 -16
  1017. PaddleClas/deploy/__init__.py +0 -1
  1018. PaddleClas/deploy/paddleserving/__init__.py +0 -0
  1019. PaddleClas/deploy/paddleserving/classification_web_service.py +0 -74
  1020. PaddleClas/deploy/paddleserving/cpu_utilization.py +0 -4
  1021. PaddleClas/deploy/paddleserving/pipeline_http_client.py +0 -20
  1022. PaddleClas/deploy/paddleserving/pipeline_rpc_client.py +0 -33
  1023. PaddleClas/deploy/paddleserving/recognition/__init__.py +0 -0
  1024. PaddleClas/deploy/paddleserving/recognition/pipeline_http_client.py +0 -21
  1025. PaddleClas/deploy/paddleserving/recognition/pipeline_rpc_client.py +0 -34
  1026. PaddleClas/deploy/paddleserving/recognition/recognition_web_service.py +0 -209
  1027. PaddleClas/deploy/python/__init__.py +0 -0
  1028. PaddleClas/deploy/python/build_gallery.py +0 -214
  1029. PaddleClas/deploy/python/det_preprocess.py +0 -205
  1030. PaddleClas/deploy/python/postprocess.py +0 -161
  1031. PaddleClas/deploy/python/predict_cls.py +0 -142
  1032. PaddleClas/deploy/python/predict_det.py +0 -158
  1033. PaddleClas/deploy/python/predict_rec.py +0 -138
  1034. PaddleClas/deploy/python/predict_system.py +0 -144
  1035. PaddleClas/deploy/python/preprocess.py +0 -337
  1036. PaddleClas/deploy/utils/__init__.py +0 -5
  1037. PaddleClas/deploy/utils/config.py +0 -197
  1038. PaddleClas/deploy/utils/draw_bbox.py +0 -61
  1039. PaddleClas/deploy/utils/encode_decode.py +0 -31
  1040. PaddleClas/deploy/utils/get_image_list.py +0 -49
  1041. PaddleClas/deploy/utils/logger.py +0 -120
  1042. PaddleClas/deploy/utils/predictor.py +0 -71
  1043. PaddleClas/deploy/vector_search/__init__.py +0 -1
  1044. PaddleClas/deploy/vector_search/interface.py +0 -272
  1045. PaddleClas/deploy/vector_search/test.py +0 -34
  1046. PaddleClas/hubconf.py +0 -788
  1047. PaddleClas/paddleclas.py +0 -552
  1048. PaddleClas/ppcls/__init__.py +0 -20
  1049. PaddleClas/ppcls/arch/__init__.py +0 -127
  1050. PaddleClas/ppcls/arch/backbone/__init__.py +0 -80
  1051. PaddleClas/ppcls/arch/backbone/base/__init__.py +0 -0
  1052. PaddleClas/ppcls/arch/backbone/base/theseus_layer.py +0 -126
  1053. PaddleClas/ppcls/arch/backbone/legendary_models/__init__.py +0 -6
  1054. PaddleClas/ppcls/arch/backbone/legendary_models/esnet.py +0 -355
  1055. PaddleClas/ppcls/arch/backbone/legendary_models/hrnet.py +0 -744
  1056. PaddleClas/ppcls/arch/backbone/legendary_models/inception_v3.py +0 -539
  1057. PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v1.py +0 -234
  1058. PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v3.py +0 -561
  1059. PaddleClas/ppcls/arch/backbone/legendary_models/pp_lcnet.py +0 -399
  1060. PaddleClas/ppcls/arch/backbone/legendary_models/resnet.py +0 -534
  1061. PaddleClas/ppcls/arch/backbone/legendary_models/vgg.py +0 -231
  1062. PaddleClas/ppcls/arch/backbone/model_zoo/__init__.py +0 -0
  1063. PaddleClas/ppcls/arch/backbone/model_zoo/alexnet.py +0 -168
  1064. PaddleClas/ppcls/arch/backbone/model_zoo/cspnet.py +0 -376
  1065. PaddleClas/ppcls/arch/backbone/model_zoo/darknet.py +0 -197
  1066. PaddleClas/ppcls/arch/backbone/model_zoo/densenet.py +0 -344
  1067. PaddleClas/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py +0 -272
  1068. PaddleClas/ppcls/arch/backbone/model_zoo/dla.py +0 -528
  1069. PaddleClas/ppcls/arch/backbone/model_zoo/dpn.py +0 -451
  1070. PaddleClas/ppcls/arch/backbone/model_zoo/efficientnet.py +0 -976
  1071. PaddleClas/ppcls/arch/backbone/model_zoo/ghostnet.py +0 -363
  1072. PaddleClas/ppcls/arch/backbone/model_zoo/googlenet.py +0 -229
  1073. PaddleClas/ppcls/arch/backbone/model_zoo/gvt.py +0 -693
  1074. PaddleClas/ppcls/arch/backbone/model_zoo/hardnet.py +0 -293
  1075. PaddleClas/ppcls/arch/backbone/model_zoo/inception_v4.py +0 -477
  1076. PaddleClas/ppcls/arch/backbone/model_zoo/levit.py +0 -589
  1077. PaddleClas/ppcls/arch/backbone/model_zoo/mixnet.py +0 -815
  1078. PaddleClas/ppcls/arch/backbone/model_zoo/mobilenet_v2.py +0 -287
  1079. PaddleClas/ppcls/arch/backbone/model_zoo/rednet.py +0 -203
  1080. PaddleClas/ppcls/arch/backbone/model_zoo/regnet.py +0 -431
  1081. PaddleClas/ppcls/arch/backbone/model_zoo/repvgg.py +0 -422
  1082. PaddleClas/ppcls/arch/backbone/model_zoo/res2net.py +0 -264
  1083. PaddleClas/ppcls/arch/backbone/model_zoo/res2net_vd.py +0 -305
  1084. PaddleClas/ppcls/arch/backbone/model_zoo/resnest.py +0 -740
  1085. PaddleClas/ppcls/arch/backbone/model_zoo/resnet_vc.py +0 -309
  1086. PaddleClas/ppcls/arch/backbone/model_zoo/resnext.py +0 -298
  1087. PaddleClas/ppcls/arch/backbone/model_zoo/resnext101_wsl.py +0 -490
  1088. PaddleClas/ppcls/arch/backbone/model_zoo/resnext_vd.py +0 -317
  1089. PaddleClas/ppcls/arch/backbone/model_zoo/rexnet.py +0 -281
  1090. PaddleClas/ppcls/arch/backbone/model_zoo/se_resnet_vd.py +0 -390
  1091. PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext.py +0 -364
  1092. PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext_vd.py +0 -309
  1093. PaddleClas/ppcls/arch/backbone/model_zoo/shufflenet_v2.py +0 -362
  1094. PaddleClas/ppcls/arch/backbone/model_zoo/squeezenet.py +0 -194
  1095. PaddleClas/ppcls/arch/backbone/model_zoo/swin_transformer.py +0 -857
  1096. PaddleClas/ppcls/arch/backbone/model_zoo/tnt.py +0 -385
  1097. PaddleClas/ppcls/arch/backbone/model_zoo/vision_transformer.py +0 -495
  1098. PaddleClas/ppcls/arch/backbone/model_zoo/xception.py +0 -377
  1099. PaddleClas/ppcls/arch/backbone/model_zoo/xception_deeplab.py +0 -421
  1100. PaddleClas/ppcls/arch/backbone/variant_models/__init__.py +0 -3
  1101. PaddleClas/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py +0 -29
  1102. PaddleClas/ppcls/arch/backbone/variant_models/resnet_variant.py +0 -23
  1103. PaddleClas/ppcls/arch/backbone/variant_models/vgg_variant.py +0 -28
  1104. PaddleClas/ppcls/arch/gears/__init__.py +0 -32
  1105. PaddleClas/ppcls/arch/gears/arcmargin.py +0 -72
  1106. PaddleClas/ppcls/arch/gears/circlemargin.py +0 -59
  1107. PaddleClas/ppcls/arch/gears/cosmargin.py +0 -55
  1108. PaddleClas/ppcls/arch/gears/fc.py +0 -35
  1109. PaddleClas/ppcls/arch/gears/identity_head.py +0 -9
  1110. PaddleClas/ppcls/arch/gears/vehicle_neck.py +0 -52
  1111. PaddleClas/ppcls/arch/utils.py +0 -53
  1112. PaddleClas/ppcls/data/__init__.py +0 -144
  1113. PaddleClas/ppcls/data/dataloader/DistributedRandomIdentitySampler.py +0 -90
  1114. PaddleClas/ppcls/data/dataloader/__init__.py +0 -9
  1115. PaddleClas/ppcls/data/dataloader/common_dataset.py +0 -84
  1116. PaddleClas/ppcls/data/dataloader/dali.py +0 -319
  1117. PaddleClas/ppcls/data/dataloader/icartoon_dataset.py +0 -36
  1118. PaddleClas/ppcls/data/dataloader/imagenet_dataset.py +0 -38
  1119. PaddleClas/ppcls/data/dataloader/logo_dataset.py +0 -46
  1120. PaddleClas/ppcls/data/dataloader/mix_dataset.py +0 -49
  1121. PaddleClas/ppcls/data/dataloader/mix_sampler.py +0 -79
  1122. PaddleClas/ppcls/data/dataloader/multilabel_dataset.py +0 -59
  1123. PaddleClas/ppcls/data/dataloader/pk_sampler.py +0 -105
  1124. PaddleClas/ppcls/data/dataloader/vehicle_dataset.py +0 -138
  1125. PaddleClas/ppcls/data/postprocess/__init__.py +0 -41
  1126. PaddleClas/ppcls/data/postprocess/topk.py +0 -85
  1127. PaddleClas/ppcls/data/preprocess/__init__.py +0 -100
  1128. PaddleClas/ppcls/data/preprocess/batch_ops/__init__.py +0 -1
  1129. PaddleClas/ppcls/data/preprocess/batch_ops/batch_operators.py +0 -231
  1130. PaddleClas/ppcls/data/preprocess/ops/__init__.py +0 -1
  1131. PaddleClas/ppcls/data/preprocess/ops/autoaugment.py +0 -264
  1132. PaddleClas/ppcls/data/preprocess/ops/cutout.py +0 -41
  1133. PaddleClas/ppcls/data/preprocess/ops/fmix.py +0 -217
  1134. PaddleClas/ppcls/data/preprocess/ops/functional.py +0 -138
  1135. PaddleClas/ppcls/data/preprocess/ops/grid.py +0 -89
  1136. PaddleClas/ppcls/data/preprocess/ops/hide_and_seek.py +0 -44
  1137. PaddleClas/ppcls/data/preprocess/ops/operators.py +0 -384
  1138. PaddleClas/ppcls/data/preprocess/ops/randaugment.py +0 -106
  1139. PaddleClas/ppcls/data/preprocess/ops/random_erasing.py +0 -90
  1140. PaddleClas/ppcls/data/preprocess/ops/timm_autoaugment.py +0 -877
  1141. PaddleClas/ppcls/data/utils/__init__.py +0 -13
  1142. PaddleClas/ppcls/data/utils/get_image_list.py +0 -49
  1143. PaddleClas/ppcls/engine/__init__.py +0 -0
  1144. PaddleClas/ppcls/engine/engine.py +0 -436
  1145. PaddleClas/ppcls/engine/evaluation/__init__.py +0 -16
  1146. PaddleClas/ppcls/engine/evaluation/classification.py +0 -143
  1147. PaddleClas/ppcls/engine/evaluation/retrieval.py +0 -169
  1148. PaddleClas/ppcls/engine/slim/__init__.py +0 -16
  1149. PaddleClas/ppcls/engine/slim/prune.py +0 -66
  1150. PaddleClas/ppcls/engine/slim/quant.py +0 -55
  1151. PaddleClas/ppcls/engine/train/__init__.py +0 -14
  1152. PaddleClas/ppcls/engine/train/train.py +0 -79
  1153. PaddleClas/ppcls/engine/train/utils.py +0 -72
  1154. PaddleClas/ppcls/loss/__init__.py +0 -65
  1155. PaddleClas/ppcls/loss/celoss.py +0 -67
  1156. PaddleClas/ppcls/loss/centerloss.py +0 -54
  1157. PaddleClas/ppcls/loss/comfunc.py +0 -45
  1158. PaddleClas/ppcls/loss/deephashloss.py +0 -92
  1159. PaddleClas/ppcls/loss/distanceloss.py +0 -43
  1160. PaddleClas/ppcls/loss/distillationloss.py +0 -141
  1161. PaddleClas/ppcls/loss/dmlloss.py +0 -46
  1162. PaddleClas/ppcls/loss/emlloss.py +0 -97
  1163. PaddleClas/ppcls/loss/googlenetloss.py +0 -41
  1164. PaddleClas/ppcls/loss/msmloss.py +0 -78
  1165. PaddleClas/ppcls/loss/multilabelloss.py +0 -43
  1166. PaddleClas/ppcls/loss/npairsloss.py +0 -38
  1167. PaddleClas/ppcls/loss/pairwisecosface.py +0 -55
  1168. PaddleClas/ppcls/loss/supconloss.py +0 -108
  1169. PaddleClas/ppcls/loss/trihardloss.py +0 -82
  1170. PaddleClas/ppcls/loss/triplet.py +0 -137
  1171. PaddleClas/ppcls/metric/__init__.py +0 -51
  1172. PaddleClas/ppcls/metric/metrics.py +0 -308
  1173. PaddleClas/ppcls/optimizer/__init__.py +0 -72
  1174. PaddleClas/ppcls/optimizer/learning_rate.py +0 -326
  1175. PaddleClas/ppcls/optimizer/optimizer.py +0 -207
  1176. PaddleClas/ppcls/utils/__init__.py +0 -27
  1177. PaddleClas/ppcls/utils/check.py +0 -151
  1178. PaddleClas/ppcls/utils/config.py +0 -210
  1179. PaddleClas/ppcls/utils/download.py +0 -319
  1180. PaddleClas/ppcls/utils/ema.py +0 -63
  1181. PaddleClas/ppcls/utils/logger.py +0 -137
  1182. PaddleClas/ppcls/utils/metrics.py +0 -107
  1183. PaddleClas/ppcls/utils/misc.py +0 -63
  1184. PaddleClas/ppcls/utils/model_zoo.py +0 -213
  1185. PaddleClas/ppcls/utils/profiler.py +0 -111
  1186. PaddleClas/ppcls/utils/save_load.py +0 -136
  1187. PaddleClas/setup.py +0 -58
  1188. PaddleClas/tools/__init__.py +0 -15
  1189. PaddleClas/tools/eval.py +0 -31
  1190. PaddleClas/tools/export_model.py +0 -34
  1191. PaddleClas/tools/infer.py +0 -31
  1192. PaddleClas/tools/train.py +0 -32
  1193. paddlex/cls.py +0 -82
  1194. paddlex/command.py +0 -215
  1195. paddlex/cv/__init__.py +0 -17
  1196. paddlex/cv/datasets/__init__.py +0 -18
  1197. paddlex/cv/datasets/coco.py +0 -208
  1198. paddlex/cv/datasets/imagenet.py +0 -88
  1199. paddlex/cv/datasets/seg_dataset.py +0 -91
  1200. paddlex/cv/datasets/voc.py +0 -445
  1201. paddlex/cv/models/__init__.py +0 -18
  1202. paddlex/cv/models/base.py +0 -631
  1203. paddlex/cv/models/classifier.py +0 -989
  1204. paddlex/cv/models/detector.py +0 -2292
  1205. paddlex/cv/models/load_model.py +0 -148
  1206. paddlex/cv/models/segmenter.py +0 -768
  1207. paddlex/cv/models/slim/__init__.py +0 -13
  1208. paddlex/cv/models/slim/prune.py +0 -55
  1209. paddlex/cv/models/utils/__init__.py +0 -13
  1210. paddlex/cv/models/utils/det_metrics/__init__.py +0 -15
  1211. paddlex/cv/models/utils/det_metrics/coco_utils.py +0 -476
  1212. paddlex/cv/models/utils/det_metrics/metrics.py +0 -220
  1213. paddlex/cv/models/utils/infer_nets.py +0 -45
  1214. paddlex/cv/models/utils/seg_metrics.py +0 -62
  1215. paddlex/cv/models/utils/visualize.py +0 -399
  1216. paddlex/cv/transforms/__init__.py +0 -46
  1217. paddlex/cv/transforms/batch_operators.py +0 -286
  1218. paddlex/cv/transforms/box_utils.py +0 -41
  1219. paddlex/cv/transforms/functions.py +0 -193
  1220. paddlex/cv/transforms/operators.py +0 -1402
  1221. paddlex/deploy.py +0 -268
  1222. paddlex/det.py +0 -49
  1223. paddlex/paddleseg/__init__.py +0 -17
  1224. paddlex/paddleseg/core/__init__.py +0 -20
  1225. paddlex/paddleseg/core/infer.py +0 -289
  1226. paddlex/paddleseg/core/predict.py +0 -145
  1227. paddlex/paddleseg/core/train.py +0 -258
  1228. paddlex/paddleseg/core/val.py +0 -172
  1229. paddlex/paddleseg/cvlibs/__init__.py +0 -17
  1230. paddlex/paddleseg/cvlibs/callbacks.py +0 -279
  1231. paddlex/paddleseg/cvlibs/config.py +0 -359
  1232. paddlex/paddleseg/cvlibs/manager.py +0 -142
  1233. paddlex/paddleseg/cvlibs/param_init.py +0 -91
  1234. paddlex/paddleseg/datasets/__init__.py +0 -21
  1235. paddlex/paddleseg/datasets/ade.py +0 -112
  1236. paddlex/paddleseg/datasets/cityscapes.py +0 -86
  1237. paddlex/paddleseg/datasets/cocostuff.py +0 -79
  1238. paddlex/paddleseg/datasets/dataset.py +0 -164
  1239. paddlex/paddleseg/datasets/mini_deep_globe_road_extraction.py +0 -95
  1240. paddlex/paddleseg/datasets/optic_disc_seg.py +0 -97
  1241. paddlex/paddleseg/datasets/pascal_context.py +0 -80
  1242. paddlex/paddleseg/datasets/voc.py +0 -113
  1243. paddlex/paddleseg/models/__init__.py +0 -39
  1244. paddlex/paddleseg/models/ann.py +0 -436
  1245. paddlex/paddleseg/models/attention_unet.py +0 -189
  1246. paddlex/paddleseg/models/backbones/__init__.py +0 -18
  1247. paddlex/paddleseg/models/backbones/hrnet.py +0 -815
  1248. paddlex/paddleseg/models/backbones/mobilenetv3.py +0 -365
  1249. paddlex/paddleseg/models/backbones/resnet_vd.py +0 -364
  1250. paddlex/paddleseg/models/backbones/xception_deeplab.py +0 -415
  1251. paddlex/paddleseg/models/bisenet.py +0 -311
  1252. paddlex/paddleseg/models/danet.py +0 -220
  1253. paddlex/paddleseg/models/decoupled_segnet.py +0 -233
  1254. paddlex/paddleseg/models/deeplab.py +0 -258
  1255. paddlex/paddleseg/models/dnlnet.py +0 -231
  1256. paddlex/paddleseg/models/emanet.py +0 -219
  1257. paddlex/paddleseg/models/fast_scnn.py +0 -318
  1258. paddlex/paddleseg/models/fcn.py +0 -135
  1259. paddlex/paddleseg/models/gcnet.py +0 -223
  1260. paddlex/paddleseg/models/gscnn.py +0 -357
  1261. paddlex/paddleseg/models/hardnet.py +0 -309
  1262. paddlex/paddleseg/models/isanet.py +0 -202
  1263. paddlex/paddleseg/models/layers/__init__.py +0 -19
  1264. paddlex/paddleseg/models/layers/activation.py +0 -73
  1265. paddlex/paddleseg/models/layers/attention.py +0 -146
  1266. paddlex/paddleseg/models/layers/layer_libs.py +0 -168
  1267. paddlex/paddleseg/models/layers/nonlocal2d.py +0 -155
  1268. paddlex/paddleseg/models/layers/pyramid_pool.py +0 -182
  1269. paddlex/paddleseg/models/losses/__init__.py +0 -27
  1270. paddlex/paddleseg/models/losses/binary_cross_entropy_loss.py +0 -174
  1271. paddlex/paddleseg/models/losses/bootstrapped_cross_entropy.py +0 -73
  1272. paddlex/paddleseg/models/losses/cross_entropy_loss.py +0 -94
  1273. paddlex/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py +0 -129
  1274. paddlex/paddleseg/models/losses/dice_loss.py +0 -61
  1275. paddlex/paddleseg/models/losses/edge_attention_loss.py +0 -78
  1276. paddlex/paddleseg/models/losses/gscnn_dual_task_loss.py +0 -141
  1277. paddlex/paddleseg/models/losses/l1_loss.py +0 -76
  1278. paddlex/paddleseg/models/losses/lovasz_loss.py +0 -222
  1279. paddlex/paddleseg/models/losses/mean_square_error_loss.py +0 -65
  1280. paddlex/paddleseg/models/losses/mixed_loss.py +0 -58
  1281. paddlex/paddleseg/models/losses/ohem_cross_entropy_loss.py +0 -99
  1282. paddlex/paddleseg/models/losses/ohem_edge_attention_loss.py +0 -114
  1283. paddlex/paddleseg/models/ocrnet.py +0 -248
  1284. paddlex/paddleseg/models/pspnet.py +0 -147
  1285. paddlex/paddleseg/models/sfnet.py +0 -236
  1286. paddlex/paddleseg/models/shufflenet_slim.py +0 -268
  1287. paddlex/paddleseg/models/u2net.py +0 -574
  1288. paddlex/paddleseg/models/unet.py +0 -155
  1289. paddlex/paddleseg/models/unet_3plus.py +0 -316
  1290. paddlex/paddleseg/models/unet_plusplus.py +0 -237
  1291. paddlex/paddleseg/transforms/__init__.py +0 -16
  1292. paddlex/paddleseg/transforms/functional.py +0 -161
  1293. paddlex/paddleseg/transforms/transforms.py +0 -937
  1294. paddlex/paddleseg/utils/__init__.py +0 -22
  1295. paddlex/paddleseg/utils/config_check.py +0 -60
  1296. paddlex/paddleseg/utils/download.py +0 -163
  1297. paddlex/paddleseg/utils/env/__init__.py +0 -16
  1298. paddlex/paddleseg/utils/env/seg_env.py +0 -56
  1299. paddlex/paddleseg/utils/env/sys_env.py +0 -122
  1300. paddlex/paddleseg/utils/logger.py +0 -48
  1301. paddlex/paddleseg/utils/metrics.py +0 -146
  1302. paddlex/paddleseg/utils/progbar.py +0 -212
  1303. paddlex/paddleseg/utils/timer.py +0 -53
  1304. paddlex/paddleseg/utils/utils.py +0 -120
  1305. paddlex/paddleseg/utils/visualize.py +0 -90
  1306. paddlex/ppcls/__init__.py +0 -20
  1307. paddlex/ppcls/arch/__init__.py +0 -127
  1308. paddlex/ppcls/arch/backbone/__init__.py +0 -80
  1309. paddlex/ppcls/arch/backbone/base/__init__.py +0 -0
  1310. paddlex/ppcls/arch/backbone/base/theseus_layer.py +0 -130
  1311. paddlex/ppcls/arch/backbone/legendary_models/__init__.py +0 -6
  1312. paddlex/ppcls/arch/backbone/legendary_models/esnet.py +0 -355
  1313. paddlex/ppcls/arch/backbone/legendary_models/hrnet.py +0 -748
  1314. paddlex/ppcls/arch/backbone/legendary_models/inception_v3.py +0 -539
  1315. paddlex/ppcls/arch/backbone/legendary_models/mobilenet_v1.py +0 -234
  1316. paddlex/ppcls/arch/backbone/legendary_models/mobilenet_v3.py +0 -561
  1317. paddlex/ppcls/arch/backbone/legendary_models/pp_lcnet.py +0 -399
  1318. paddlex/ppcls/arch/backbone/legendary_models/resnet.py +0 -534
  1319. paddlex/ppcls/arch/backbone/legendary_models/vgg.py +0 -235
  1320. paddlex/ppcls/arch/backbone/model_zoo/__init__.py +0 -0
  1321. paddlex/ppcls/arch/backbone/model_zoo/alexnet.py +0 -168
  1322. paddlex/ppcls/arch/backbone/model_zoo/cspnet.py +0 -376
  1323. paddlex/ppcls/arch/backbone/model_zoo/darknet.py +0 -197
  1324. paddlex/ppcls/arch/backbone/model_zoo/densenet.py +0 -344
  1325. paddlex/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py +0 -272
  1326. paddlex/ppcls/arch/backbone/model_zoo/dla.py +0 -528
  1327. paddlex/ppcls/arch/backbone/model_zoo/dpn.py +0 -451
  1328. paddlex/ppcls/arch/backbone/model_zoo/efficientnet.py +0 -976
  1329. paddlex/ppcls/arch/backbone/model_zoo/ghostnet.py +0 -363
  1330. paddlex/ppcls/arch/backbone/model_zoo/googlenet.py +0 -229
  1331. paddlex/ppcls/arch/backbone/model_zoo/gvt.py +0 -693
  1332. paddlex/ppcls/arch/backbone/model_zoo/hardnet.py +0 -293
  1333. paddlex/ppcls/arch/backbone/model_zoo/inception_v4.py +0 -477
  1334. paddlex/ppcls/arch/backbone/model_zoo/levit.py +0 -589
  1335. paddlex/ppcls/arch/backbone/model_zoo/mixnet.py +0 -815
  1336. paddlex/ppcls/arch/backbone/model_zoo/mobilenet_v2.py +0 -287
  1337. paddlex/ppcls/arch/backbone/model_zoo/rednet.py +0 -203
  1338. paddlex/ppcls/arch/backbone/model_zoo/regnet.py +0 -431
  1339. paddlex/ppcls/arch/backbone/model_zoo/repvgg.py +0 -422
  1340. paddlex/ppcls/arch/backbone/model_zoo/res2net.py +0 -264
  1341. paddlex/ppcls/arch/backbone/model_zoo/res2net_vd.py +0 -305
  1342. paddlex/ppcls/arch/backbone/model_zoo/resnest.py +0 -740
  1343. paddlex/ppcls/arch/backbone/model_zoo/resnet_vc.py +0 -309
  1344. paddlex/ppcls/arch/backbone/model_zoo/resnext.py +0 -298
  1345. paddlex/ppcls/arch/backbone/model_zoo/resnext101_wsl.py +0 -490
  1346. paddlex/ppcls/arch/backbone/model_zoo/resnext_vd.py +0 -317
  1347. paddlex/ppcls/arch/backbone/model_zoo/rexnet.py +0 -281
  1348. paddlex/ppcls/arch/backbone/model_zoo/se_resnet_vd.py +0 -390
  1349. paddlex/ppcls/arch/backbone/model_zoo/se_resnext.py +0 -364
  1350. paddlex/ppcls/arch/backbone/model_zoo/se_resnext_vd.py +0 -309
  1351. paddlex/ppcls/arch/backbone/model_zoo/shufflenet_v2.py +0 -362
  1352. paddlex/ppcls/arch/backbone/model_zoo/squeezenet.py +0 -194
  1353. paddlex/ppcls/arch/backbone/model_zoo/swin_transformer.py +0 -857
  1354. paddlex/ppcls/arch/backbone/model_zoo/tnt.py +0 -385
  1355. paddlex/ppcls/arch/backbone/model_zoo/vision_transformer.py +0 -495
  1356. paddlex/ppcls/arch/backbone/model_zoo/xception.py +0 -377
  1357. paddlex/ppcls/arch/backbone/model_zoo/xception_deeplab.py +0 -421
  1358. paddlex/ppcls/arch/backbone/variant_models/__init__.py +0 -3
  1359. paddlex/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py +0 -29
  1360. paddlex/ppcls/arch/backbone/variant_models/resnet_variant.py +0 -23
  1361. paddlex/ppcls/arch/backbone/variant_models/vgg_variant.py +0 -28
  1362. paddlex/ppcls/arch/gears/__init__.py +0 -32
  1363. paddlex/ppcls/arch/gears/arcmargin.py +0 -72
  1364. paddlex/ppcls/arch/gears/circlemargin.py +0 -59
  1365. paddlex/ppcls/arch/gears/cosmargin.py +0 -55
  1366. paddlex/ppcls/arch/gears/fc.py +0 -35
  1367. paddlex/ppcls/arch/gears/identity_head.py +0 -9
  1368. paddlex/ppcls/arch/gears/vehicle_neck.py +0 -52
  1369. paddlex/ppcls/arch/utils.py +0 -53
  1370. paddlex/ppcls/data/__init__.py +0 -144
  1371. paddlex/ppcls/data/dataloader/DistributedRandomIdentitySampler.py +0 -90
  1372. paddlex/ppcls/data/dataloader/__init__.py +0 -9
  1373. paddlex/ppcls/data/dataloader/common_dataset.py +0 -84
  1374. paddlex/ppcls/data/dataloader/dali.py +0 -319
  1375. paddlex/ppcls/data/dataloader/icartoon_dataset.py +0 -36
  1376. paddlex/ppcls/data/dataloader/imagenet_dataset.py +0 -38
  1377. paddlex/ppcls/data/dataloader/logo_dataset.py +0 -46
  1378. paddlex/ppcls/data/dataloader/mix_dataset.py +0 -49
  1379. paddlex/ppcls/data/dataloader/mix_sampler.py +0 -79
  1380. paddlex/ppcls/data/dataloader/multilabel_dataset.py +0 -59
  1381. paddlex/ppcls/data/dataloader/pk_sampler.py +0 -105
  1382. paddlex/ppcls/data/dataloader/vehicle_dataset.py +0 -138
  1383. paddlex/ppcls/data/postprocess/__init__.py +0 -41
  1384. paddlex/ppcls/data/postprocess/topk.py +0 -85
  1385. paddlex/ppcls/data/preprocess/__init__.py +0 -100
  1386. paddlex/ppcls/data/preprocess/batch_ops/__init__.py +0 -0
  1387. paddlex/ppcls/data/preprocess/batch_ops/batch_operators.py +0 -231
  1388. paddlex/ppcls/data/preprocess/ops/__init__.py +0 -0
  1389. paddlex/ppcls/data/preprocess/ops/autoaugment.py +0 -264
  1390. paddlex/ppcls/data/preprocess/ops/cutout.py +0 -41
  1391. paddlex/ppcls/data/preprocess/ops/fmix.py +0 -217
  1392. paddlex/ppcls/data/preprocess/ops/functional.py +0 -141
  1393. paddlex/ppcls/data/preprocess/ops/grid.py +0 -89
  1394. paddlex/ppcls/data/preprocess/ops/hide_and_seek.py +0 -44
  1395. paddlex/ppcls/data/preprocess/ops/operators.py +0 -384
  1396. paddlex/ppcls/data/preprocess/ops/randaugment.py +0 -106
  1397. paddlex/ppcls/data/preprocess/ops/random_erasing.py +0 -90
  1398. paddlex/ppcls/data/preprocess/ops/timm_autoaugment.py +0 -877
  1399. paddlex/ppcls/data/utils/__init__.py +0 -13
  1400. paddlex/ppcls/data/utils/get_image_list.py +0 -49
  1401. paddlex/ppcls/engine/__init__.py +0 -0
  1402. paddlex/ppcls/engine/engine.py +0 -436
  1403. paddlex/ppcls/engine/evaluation/__init__.py +0 -16
  1404. paddlex/ppcls/engine/evaluation/classification.py +0 -143
  1405. paddlex/ppcls/engine/evaluation/retrieval.py +0 -169
  1406. paddlex/ppcls/engine/slim/__init__.py +0 -16
  1407. paddlex/ppcls/engine/slim/prune.py +0 -66
  1408. paddlex/ppcls/engine/slim/quant.py +0 -55
  1409. paddlex/ppcls/engine/train/__init__.py +0 -14
  1410. paddlex/ppcls/engine/train/train.py +0 -79
  1411. paddlex/ppcls/engine/train/utils.py +0 -72
  1412. paddlex/ppcls/loss/__init__.py +0 -65
  1413. paddlex/ppcls/loss/celoss.py +0 -67
  1414. paddlex/ppcls/loss/centerloss.py +0 -54
  1415. paddlex/ppcls/loss/comfunc.py +0 -45
  1416. paddlex/ppcls/loss/deephashloss.py +0 -96
  1417. paddlex/ppcls/loss/distanceloss.py +0 -43
  1418. paddlex/ppcls/loss/distillationloss.py +0 -141
  1419. paddlex/ppcls/loss/dmlloss.py +0 -46
  1420. paddlex/ppcls/loss/emlloss.py +0 -97
  1421. paddlex/ppcls/loss/googlenetloss.py +0 -42
  1422. paddlex/ppcls/loss/msmloss.py +0 -78
  1423. paddlex/ppcls/loss/multilabelloss.py +0 -43
  1424. paddlex/ppcls/loss/npairsloss.py +0 -38
  1425. paddlex/ppcls/loss/pairwisecosface.py +0 -59
  1426. paddlex/ppcls/loss/supconloss.py +0 -108
  1427. paddlex/ppcls/loss/trihardloss.py +0 -82
  1428. paddlex/ppcls/loss/triplet.py +0 -137
  1429. paddlex/ppcls/metric/__init__.py +0 -51
  1430. paddlex/ppcls/metric/metrics.py +0 -308
  1431. paddlex/ppcls/optimizer/__init__.py +0 -72
  1432. paddlex/ppcls/optimizer/learning_rate.py +0 -326
  1433. paddlex/ppcls/optimizer/optimizer.py +0 -208
  1434. paddlex/ppcls/utils/__init__.py +0 -27
  1435. paddlex/ppcls/utils/check.py +0 -151
  1436. paddlex/ppcls/utils/config.py +0 -210
  1437. paddlex/ppcls/utils/download.py +0 -319
  1438. paddlex/ppcls/utils/ema.py +0 -63
  1439. paddlex/ppcls/utils/logger.py +0 -137
  1440. paddlex/ppcls/utils/metrics.py +0 -112
  1441. paddlex/ppcls/utils/misc.py +0 -63
  1442. paddlex/ppcls/utils/model_zoo.py +0 -213
  1443. paddlex/ppcls/utils/profiler.py +0 -111
  1444. paddlex/ppcls/utils/save_load.py +0 -136
  1445. paddlex/ppdet/__init__.py +0 -16
  1446. paddlex/ppdet/core/__init__.py +0 -15
  1447. paddlex/ppdet/core/config/__init__.py +0 -13
  1448. paddlex/ppdet/core/config/schema.py +0 -248
  1449. paddlex/ppdet/core/config/yaml_helpers.py +0 -118
  1450. paddlex/ppdet/core/workspace.py +0 -278
  1451. paddlex/ppdet/data/__init__.py +0 -21
  1452. paddlex/ppdet/data/crop_utils/__init__.py +0 -13
  1453. paddlex/ppdet/data/crop_utils/annotation_cropper.py +0 -585
  1454. paddlex/ppdet/data/crop_utils/chip_box_utils.py +0 -170
  1455. paddlex/ppdet/data/reader.py +0 -302
  1456. paddlex/ppdet/data/shm_utils.py +0 -67
  1457. paddlex/ppdet/data/source/__init__.py +0 -29
  1458. paddlex/ppdet/data/source/category.py +0 -904
  1459. paddlex/ppdet/data/source/coco.py +0 -251
  1460. paddlex/ppdet/data/source/dataset.py +0 -197
  1461. paddlex/ppdet/data/source/keypoint_coco.py +0 -669
  1462. paddlex/ppdet/data/source/mot.py +0 -636
  1463. paddlex/ppdet/data/source/sniper_coco.py +0 -191
  1464. paddlex/ppdet/data/source/voc.py +0 -231
  1465. paddlex/ppdet/data/source/widerface.py +0 -180
  1466. paddlex/ppdet/data/transform/__init__.py +0 -28
  1467. paddlex/ppdet/data/transform/atss_assigner.py +0 -270
  1468. paddlex/ppdet/data/transform/autoaugment_utils.py +0 -1591
  1469. paddlex/ppdet/data/transform/batch_operators.py +0 -1080
  1470. paddlex/ppdet/data/transform/gridmask_utils.py +0 -86
  1471. paddlex/ppdet/data/transform/keypoint_operators.py +0 -868
  1472. paddlex/ppdet/data/transform/mot_operators.py +0 -628
  1473. paddlex/ppdet/data/transform/op_helper.py +0 -498
  1474. paddlex/ppdet/data/transform/operators.py +0 -3025
  1475. paddlex/ppdet/engine/__init__.py +0 -30
  1476. paddlex/ppdet/engine/callbacks.py +0 -340
  1477. paddlex/ppdet/engine/env.py +0 -50
  1478. paddlex/ppdet/engine/export_utils.py +0 -177
  1479. paddlex/ppdet/engine/tracker.py +0 -538
  1480. paddlex/ppdet/engine/trainer.py +0 -723
  1481. paddlex/ppdet/metrics/__init__.py +0 -29
  1482. paddlex/ppdet/metrics/coco_utils.py +0 -184
  1483. paddlex/ppdet/metrics/json_results.py +0 -149
  1484. paddlex/ppdet/metrics/keypoint_metrics.py +0 -401
  1485. paddlex/ppdet/metrics/map_utils.py +0 -444
  1486. paddlex/ppdet/metrics/mcmot_metrics.py +0 -470
  1487. paddlex/ppdet/metrics/metrics.py +0 -434
  1488. paddlex/ppdet/metrics/mot_metrics.py +0 -1236
  1489. paddlex/ppdet/metrics/munkres.py +0 -428
  1490. paddlex/ppdet/metrics/widerface_utils.py +0 -393
  1491. paddlex/ppdet/model_zoo/__init__.py +0 -18
  1492. paddlex/ppdet/model_zoo/model_zoo.py +0 -84
  1493. paddlex/ppdet/modeling/__init__.py +0 -45
  1494. paddlex/ppdet/modeling/architectures/__init__.py +0 -51
  1495. paddlex/ppdet/modeling/architectures/blazeface.py +0 -91
  1496. paddlex/ppdet/modeling/architectures/cascade_rcnn.py +0 -144
  1497. paddlex/ppdet/modeling/architectures/centernet.py +0 -108
  1498. paddlex/ppdet/modeling/architectures/deepsort.py +0 -69
  1499. paddlex/ppdet/modeling/architectures/detr.py +0 -93
  1500. paddlex/ppdet/modeling/architectures/fairmot.py +0 -100
  1501. paddlex/ppdet/modeling/architectures/faster_rcnn.py +0 -106
  1502. paddlex/ppdet/modeling/architectures/fcos.py +0 -105
  1503. paddlex/ppdet/modeling/architectures/gfl.py +0 -87
  1504. paddlex/ppdet/modeling/architectures/jde.py +0 -111
  1505. paddlex/ppdet/modeling/architectures/keypoint_hrhrnet.py +0 -287
  1506. paddlex/ppdet/modeling/architectures/keypoint_hrnet.py +0 -267
  1507. paddlex/ppdet/modeling/architectures/mask_rcnn.py +0 -135
  1508. paddlex/ppdet/modeling/architectures/meta_arch.py +0 -128
  1509. paddlex/ppdet/modeling/architectures/picodet.py +0 -91
  1510. paddlex/ppdet/modeling/architectures/s2anet.py +0 -102
  1511. paddlex/ppdet/modeling/architectures/solov2.py +0 -110
  1512. paddlex/ppdet/modeling/architectures/sparse_rcnn.py +0 -99
  1513. paddlex/ppdet/modeling/architectures/ssd.py +0 -93
  1514. paddlex/ppdet/modeling/architectures/tood.py +0 -78
  1515. paddlex/ppdet/modeling/architectures/ttfnet.py +0 -98
  1516. paddlex/ppdet/modeling/architectures/yolo.py +0 -124
  1517. paddlex/ppdet/modeling/assigners/__init__.py +0 -23
  1518. paddlex/ppdet/modeling/assigners/atss_assigner.py +0 -211
  1519. paddlex/ppdet/modeling/assigners/simota_assigner.py +0 -262
  1520. paddlex/ppdet/modeling/assigners/task_aligned_assigner.py +0 -158
  1521. paddlex/ppdet/modeling/assigners/utils.py +0 -195
  1522. paddlex/ppdet/modeling/backbones/__init__.py +0 -49
  1523. paddlex/ppdet/modeling/backbones/blazenet.py +0 -323
  1524. paddlex/ppdet/modeling/backbones/darknet.py +0 -340
  1525. paddlex/ppdet/modeling/backbones/dla.py +0 -244
  1526. paddlex/ppdet/modeling/backbones/esnet.py +0 -290
  1527. paddlex/ppdet/modeling/backbones/ghostnet.py +0 -470
  1528. paddlex/ppdet/modeling/backbones/hardnet.py +0 -224
  1529. paddlex/ppdet/modeling/backbones/hrnet.py +0 -727
  1530. paddlex/ppdet/modeling/backbones/lcnet.py +0 -259
  1531. paddlex/ppdet/modeling/backbones/lite_hrnet.py +0 -886
  1532. paddlex/ppdet/modeling/backbones/mobilenet_v1.py +0 -418
  1533. paddlex/ppdet/modeling/backbones/mobilenet_v3.py +0 -483
  1534. paddlex/ppdet/modeling/backbones/name_adapter.py +0 -69
  1535. paddlex/ppdet/modeling/backbones/res2net.py +0 -358
  1536. paddlex/ppdet/modeling/backbones/resnet.py +0 -613
  1537. paddlex/ppdet/modeling/backbones/senet.py +0 -139
  1538. paddlex/ppdet/modeling/backbones/shufflenet_v2.py +0 -246
  1539. paddlex/ppdet/modeling/backbones/swin_transformer.py +0 -743
  1540. paddlex/ppdet/modeling/backbones/vgg.py +0 -210
  1541. paddlex/ppdet/modeling/bbox_utils.py +0 -778
  1542. paddlex/ppdet/modeling/heads/__init__.py +0 -53
  1543. paddlex/ppdet/modeling/heads/bbox_head.py +0 -377
  1544. paddlex/ppdet/modeling/heads/cascade_head.py +0 -284
  1545. paddlex/ppdet/modeling/heads/centernet_head.py +0 -292
  1546. paddlex/ppdet/modeling/heads/detr_head.py +0 -368
  1547. paddlex/ppdet/modeling/heads/face_head.py +0 -110
  1548. paddlex/ppdet/modeling/heads/fcos_head.py +0 -259
  1549. paddlex/ppdet/modeling/heads/gfl_head.py +0 -487
  1550. paddlex/ppdet/modeling/heads/keypoint_hrhrnet_head.py +0 -108
  1551. paddlex/ppdet/modeling/heads/mask_head.py +0 -250
  1552. paddlex/ppdet/modeling/heads/pico_head.py +0 -278
  1553. paddlex/ppdet/modeling/heads/roi_extractor.py +0 -111
  1554. paddlex/ppdet/modeling/heads/s2anet_head.py +0 -1056
  1555. paddlex/ppdet/modeling/heads/simota_head.py +0 -506
  1556. paddlex/ppdet/modeling/heads/solov2_head.py +0 -560
  1557. paddlex/ppdet/modeling/heads/sparsercnn_head.py +0 -375
  1558. paddlex/ppdet/modeling/heads/ssd_head.py +0 -215
  1559. paddlex/ppdet/modeling/heads/tood_head.py +0 -366
  1560. paddlex/ppdet/modeling/heads/ttf_head.py +0 -316
  1561. paddlex/ppdet/modeling/heads/yolo_head.py +0 -124
  1562. paddlex/ppdet/modeling/initializer.py +0 -317
  1563. paddlex/ppdet/modeling/keypoint_utils.py +0 -342
  1564. paddlex/ppdet/modeling/layers.py +0 -1430
  1565. paddlex/ppdet/modeling/losses/__init__.py +0 -43
  1566. paddlex/ppdet/modeling/losses/ctfocal_loss.py +0 -68
  1567. paddlex/ppdet/modeling/losses/detr_loss.py +0 -233
  1568. paddlex/ppdet/modeling/losses/fairmot_loss.py +0 -41
  1569. paddlex/ppdet/modeling/losses/fcos_loss.py +0 -225
  1570. paddlex/ppdet/modeling/losses/gfocal_loss.py +0 -217
  1571. paddlex/ppdet/modeling/losses/iou_aware_loss.py +0 -47
  1572. paddlex/ppdet/modeling/losses/iou_loss.py +0 -210
  1573. paddlex/ppdet/modeling/losses/jde_loss.py +0 -193
  1574. paddlex/ppdet/modeling/losses/keypoint_loss.py +0 -229
  1575. paddlex/ppdet/modeling/losses/solov2_loss.py +0 -101
  1576. paddlex/ppdet/modeling/losses/sparsercnn_loss.py +0 -425
  1577. paddlex/ppdet/modeling/losses/ssd_loss.py +0 -170
  1578. paddlex/ppdet/modeling/losses/varifocal_loss.py +0 -152
  1579. paddlex/ppdet/modeling/losses/yolo_loss.py +0 -212
  1580. paddlex/ppdet/modeling/mot/__init__.py +0 -25
  1581. paddlex/ppdet/modeling/mot/matching/__init__.py +0 -19
  1582. paddlex/ppdet/modeling/mot/matching/deepsort_matching.py +0 -382
  1583. paddlex/ppdet/modeling/mot/matching/jde_matching.py +0 -144
  1584. paddlex/ppdet/modeling/mot/motion/__init__.py +0 -17
  1585. paddlex/ppdet/modeling/mot/motion/kalman_filter.py +0 -270
  1586. paddlex/ppdet/modeling/mot/tracker/__init__.py +0 -23
  1587. paddlex/ppdet/modeling/mot/tracker/base_jde_tracker.py +0 -297
  1588. paddlex/ppdet/modeling/mot/tracker/base_sde_tracker.py +0 -156
  1589. paddlex/ppdet/modeling/mot/tracker/deepsort_tracker.py +0 -188
  1590. paddlex/ppdet/modeling/mot/tracker/jde_tracker.py +0 -277
  1591. paddlex/ppdet/modeling/mot/utils.py +0 -263
  1592. paddlex/ppdet/modeling/mot/visualization.py +0 -150
  1593. paddlex/ppdet/modeling/necks/__init__.py +0 -30
  1594. paddlex/ppdet/modeling/necks/bifpn.py +0 -302
  1595. paddlex/ppdet/modeling/necks/blazeface_fpn.py +0 -216
  1596. paddlex/ppdet/modeling/necks/centernet_fpn.py +0 -426
  1597. paddlex/ppdet/modeling/necks/csp_pan.py +0 -364
  1598. paddlex/ppdet/modeling/necks/fpn.py +0 -231
  1599. paddlex/ppdet/modeling/necks/hrfpn.py +0 -126
  1600. paddlex/ppdet/modeling/necks/ttf_fpn.py +0 -242
  1601. paddlex/ppdet/modeling/necks/yolo_fpn.py +0 -988
  1602. paddlex/ppdet/modeling/ops.py +0 -1611
  1603. paddlex/ppdet/modeling/post_process.py +0 -731
  1604. paddlex/ppdet/modeling/proposal_generator/__init__.py +0 -2
  1605. paddlex/ppdet/modeling/proposal_generator/anchor_generator.py +0 -135
  1606. paddlex/ppdet/modeling/proposal_generator/proposal_generator.py +0 -77
  1607. paddlex/ppdet/modeling/proposal_generator/rpn_head.py +0 -260
  1608. paddlex/ppdet/modeling/proposal_generator/target.py +0 -681
  1609. paddlex/ppdet/modeling/proposal_generator/target_layer.py +0 -491
  1610. paddlex/ppdet/modeling/reid/__init__.py +0 -25
  1611. paddlex/ppdet/modeling/reid/fairmot_embedding_head.py +0 -225
  1612. paddlex/ppdet/modeling/reid/jde_embedding_head.py +0 -214
  1613. paddlex/ppdet/modeling/reid/pplcnet_embedding.py +0 -282
  1614. paddlex/ppdet/modeling/reid/pyramidal_embedding.py +0 -144
  1615. paddlex/ppdet/modeling/reid/resnet.py +0 -310
  1616. paddlex/ppdet/modeling/shape_spec.py +0 -25
  1617. paddlex/ppdet/modeling/transformers/__init__.py +0 -25
  1618. paddlex/ppdet/modeling/transformers/deformable_transformer.py +0 -517
  1619. paddlex/ppdet/modeling/transformers/detr_transformer.py +0 -353
  1620. paddlex/ppdet/modeling/transformers/matchers.py +0 -127
  1621. paddlex/ppdet/modeling/transformers/position_encoding.py +0 -108
  1622. paddlex/ppdet/modeling/transformers/utils.py +0 -110
  1623. paddlex/ppdet/optimizer.py +0 -335
  1624. paddlex/ppdet/slim/__init__.py +0 -82
  1625. paddlex/ppdet/slim/distill.py +0 -110
  1626. paddlex/ppdet/slim/prune.py +0 -85
  1627. paddlex/ppdet/slim/quant.py +0 -84
  1628. paddlex/ppdet/slim/unstructured_prune.py +0 -66
  1629. paddlex/ppdet/utils/__init__.py +0 -13
  1630. paddlex/ppdet/utils/check.py +0 -112
  1631. paddlex/ppdet/utils/checkpoint.py +0 -226
  1632. paddlex/ppdet/utils/cli.py +0 -151
  1633. paddlex/ppdet/utils/colormap.py +0 -58
  1634. paddlex/ppdet/utils/download.py +0 -558
  1635. paddlex/ppdet/utils/logger.py +0 -70
  1636. paddlex/ppdet/utils/profiler.py +0 -111
  1637. paddlex/ppdet/utils/stats.py +0 -94
  1638. paddlex/ppdet/utils/visualizer.py +0 -321
  1639. paddlex/ppdet/utils/voc_utils.py +0 -86
  1640. paddlex/seg.py +0 -41
  1641. paddlex/tools/__init__.py +0 -17
  1642. paddlex/tools/anchor_clustering/__init__.py +0 -15
  1643. paddlex/tools/anchor_clustering/yolo_cluster.py +0 -178
  1644. paddlex/tools/convert.py +0 -52
  1645. paddlex/tools/dataset_conversion/__init__.py +0 -24
  1646. paddlex/tools/dataset_conversion/x2coco.py +0 -379
  1647. paddlex/tools/dataset_conversion/x2imagenet.py +0 -82
  1648. paddlex/tools/dataset_conversion/x2seg.py +0 -343
  1649. paddlex/tools/dataset_conversion/x2voc.py +0 -230
  1650. paddlex/tools/dataset_split/__init__.py +0 -23
  1651. paddlex/tools/dataset_split/coco_split.py +0 -69
  1652. paddlex/tools/dataset_split/imagenet_split.py +0 -75
  1653. paddlex/tools/dataset_split/seg_split.py +0 -96
  1654. paddlex/tools/dataset_split/utils.py +0 -75
  1655. paddlex/tools/dataset_split/voc_split.py +0 -91
  1656. paddlex/tools/split.py +0 -41
  1657. paddlex/utils/checkpoint.py +0 -492
  1658. paddlex/utils/shm.py +0 -67
  1659. paddlex/utils/stats.py +0 -68
  1660. paddlex/utils/utils.py +0 -229
  1661. paddlex-2.1.0.data/data/paddlex_restful/restful/templates/paddlex_restful_demo.html +0 -5205
  1662. paddlex-2.1.0.dist-info/LICENSE +0 -201
  1663. paddlex-2.1.0.dist-info/METADATA +0 -32
  1664. paddlex-2.1.0.dist-info/RECORD +0 -698
  1665. paddlex-2.1.0.dist-info/WHEEL +0 -5
  1666. paddlex-2.1.0.dist-info/entry_points.txt +0 -4
  1667. paddlex-2.1.0.dist-info/top_level.txt +0 -3
  1668. paddlex_restful/__init__.py +0 -15
  1669. paddlex_restful/command.py +0 -63
  1670. paddlex_restful/restful/__init__.py +0 -15
  1671. paddlex_restful/restful/app.py +0 -969
  1672. paddlex_restful/restful/dataset/__init__.py +0 -13
  1673. paddlex_restful/restful/dataset/cls_dataset.py +0 -159
  1674. paddlex_restful/restful/dataset/dataset.py +0 -266
  1675. paddlex_restful/restful/dataset/datasetbase.py +0 -86
  1676. paddlex_restful/restful/dataset/det_dataset.py +0 -190
  1677. paddlex_restful/restful/dataset/ins_seg_dataset.py +0 -312
  1678. paddlex_restful/restful/dataset/operate.py +0 -155
  1679. paddlex_restful/restful/dataset/seg_dataset.py +0 -222
  1680. paddlex_restful/restful/dataset/utils.py +0 -267
  1681. paddlex_restful/restful/demo.py +0 -202
  1682. paddlex_restful/restful/dir.py +0 -45
  1683. paddlex_restful/restful/model.py +0 -312
  1684. paddlex_restful/restful/project/__init__.py +0 -13
  1685. paddlex_restful/restful/project/evaluate/__init__.py +0 -13
  1686. paddlex_restful/restful/project/evaluate/classification.py +0 -126
  1687. paddlex_restful/restful/project/evaluate/detection.py +0 -789
  1688. paddlex_restful/restful/project/evaluate/draw_pred_result.py +0 -181
  1689. paddlex_restful/restful/project/evaluate/segmentation.py +0 -122
  1690. paddlex_restful/restful/project/operate.py +0 -931
  1691. paddlex_restful/restful/project/project.py +0 -143
  1692. paddlex_restful/restful/project/prune/__init__.py +0 -13
  1693. paddlex_restful/restful/project/prune/classification.py +0 -32
  1694. paddlex_restful/restful/project/prune/detection.py +0 -48
  1695. paddlex_restful/restful/project/prune/segmentation.py +0 -34
  1696. paddlex_restful/restful/project/task.py +0 -884
  1697. paddlex_restful/restful/project/train/__init__.py +0 -13
  1698. paddlex_restful/restful/project/train/classification.py +0 -141
  1699. paddlex_restful/restful/project/train/detection.py +0 -263
  1700. paddlex_restful/restful/project/train/params.py +0 -432
  1701. paddlex_restful/restful/project/train/params_v2.py +0 -326
  1702. paddlex_restful/restful/project/train/segmentation.py +0 -191
  1703. paddlex_restful/restful/project/visualize.py +0 -244
  1704. paddlex_restful/restful/system.py +0 -102
  1705. paddlex_restful/restful/templates/paddlex_restful_demo.html +0 -5205
  1706. paddlex_restful/restful/utils.py +0 -841
  1707. paddlex_restful/restful/workspace.py +0 -343
  1708. paddlex_restful/restful/workspace_pb2.py +0 -1411
@@ -0,0 +1,3504 @@
1
+ # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import io
17
+ import json
18
+ import os
19
+
20
+ import warnings
21
+ from collections import OrderedDict, UserDict
22
+ from dataclasses import dataclass, field
23
+ from enum import Enum
24
+ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
25
+
26
+ import numpy as np
27
+ import lazy_paddle as paddle
28
+
29
+ from .....utils import logging
30
+
31
+ __all__ = [
32
+ "AddedToken",
33
+ "FastEncoding",
34
+ "ExplicitEnum",
35
+ "PaddingStrategy",
36
+ "TensorType",
37
+ "TruncationStrategy",
38
+ "CharSpan",
39
+ "TokenSpan",
40
+ "BatchEncoding",
41
+ "SpecialTokensMixin",
42
+ "PretrainedTokenizerBase",
43
+ ]
44
+
45
+ TOKENIZER_CONFIG_NAME = "tokenizer_config.json"
46
+ CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"
48
+
49
+ VERY_LARGE_INTEGER = int(
50
+ 1e30
51
+ ) # This is used to set the max input length for a model with infinite size input
52
+ LARGE_INTEGER = int(
53
+ 1e20
54
+ ) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER
55
+
56
+ # Define type aliases and NamedTuples
57
+ TextInput = str
58
+ PreTokenizedInput = List[str]
59
+ EncodedInput = List[int]
60
+ TextInputPair = Tuple[str, str]
61
+ PreTokenizedInputPair = Tuple[List[str], List[str]]
62
+ EncodedInputPair = Tuple[List[int], List[int]]
63
+
64
+ # Slow tokenizers used to be saved in three separated files
65
+ SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
66
+ ADDED_TOKENS_FILE = "added_tokens.json"
67
+ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
68
+
69
+
70
+ @dataclass(frozen=True, eq=True)
71
+ class AddedToken:
72
+ """
73
+     AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
74
+ way it should behave.
75
+ """
76
+
77
+ content: str = field(default_factory=str)
78
+ single_word: bool = False
79
+ lstrip: bool = False
80
+ rstrip: bool = False
81
+ normalized: bool = True
82
+ special: bool = True
83
+
84
+ def __getstate__(self):
85
+ return self.__dict__
86
+
87
+ def __str__(self):
88
+ return self.content
89
+
90
+
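# A minimal sketch of how AddedToken might be constructed; the token string below is
# purely illustrative and is not taken from the diff.
custom_token = AddedToken("<doc_sep>", single_word=True, lstrip=False, rstrip=False)
assert str(custom_token) == "<doc_sep>"                   # __str__ returns the raw content
assert custom_token.special and custom_token.normalized   # defaults from the dataclass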
91
+ @dataclass
92
+ class FastEncoding:
93
+     """This is a dummy class reserved for the fast tokenizer."""
94
+
95
+ pass
96
+
97
+
98
+ class ExplicitEnum(Enum):
99
+ """
100
+ Enum with more explicit error message for missing values.
101
+ """
102
+
103
+ @classmethod
104
+ def _missing_(cls, value):
105
+ raise ValueError(
106
+ f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
107
+ )
108
+
109
+
110
+ class PaddingStrategy(ExplicitEnum):
111
+ """
112
+ Possible values for the `padding` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in an
113
+ IDE.
114
+ """
115
+
116
+ LONGEST = "longest"
117
+ MAX_LENGTH = "max_length"
118
+ DO_NOT_PAD = "do_not_pad"
119
+
120
+
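# A small sketch of the ExplicitEnum behavior defined above: a valid value resolves to
# the corresponding member, while an unknown value triggers _missing_ and raises a
# ValueError listing the allowed options. The value "leftmost" is made up.
assert PaddingStrategy("longest") is PaddingStrategy.LONGEST
try:
    PaddingStrategy("leftmost")
except ValueError as exc:
    print(exc)  # lists 'longest', 'max_length', 'do_not_pad' as the valid choices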
121
+ class TensorType(ExplicitEnum):
122
+ """
123
+ Possible values for the `return_tensors` argument in [`PretrainedTokenizerBase.__call__`]. Useful for
124
+ tab-completion in an IDE.
125
+ """
126
+
127
+ PADDLE = "pd"
128
+ NUMPY = "np"
129
+
130
+
131
+ def to_py_obj(obj):
132
+ """
133
+ Convert a Paddle tensor, Numpy array or python list to a python list.
134
+ """
135
+ if isinstance(obj, (dict, UserDict)):
136
+ return {k: to_py_obj(v) for k, v in obj.items()}
137
+ elif isinstance(obj, (list, tuple)):
138
+ return [to_py_obj(o) for o in obj]
139
+ elif isinstance(obj, paddle.Tensor):
140
+ return obj.numpy().tolist()
141
+ elif isinstance(obj, (np.ndarray, np.number)): # tolist also works on 0d np arrays
142
+ return obj.tolist()
143
+ else:
144
+ return obj
145
+
146
+
147
+ def _is_numpy(x):
148
+ return isinstance(x, np.ndarray)
149
+
150
+
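# A quick sketch of to_py_obj: nested containers of numpy arrays (or Paddle tensors)
# are converted recursively into plain Python lists, while scalars and strings pass
# through unchanged. The sample batch below is invented for illustration.
batch = {"input_ids": np.array([[1, 2, 3]]), "length": np.int64(3), "text": "hello"}
assert to_py_obj(batch) == {"input_ids": [[1, 2, 3]], "length": 3, "text": "hello"}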
151
+ class TruncationStrategy(ExplicitEnum):
152
+ """
153
+ Possible values for the `truncation` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in
154
+ an IDE.
155
+ """
156
+
157
+ ONLY_FIRST = "only_first"
158
+ ONLY_SECOND = "only_second"
159
+ LONGEST_FIRST = "longest_first"
160
+ DO_NOT_TRUNCATE = "do_not_truncate"
161
+
162
+
163
+ class CharSpan(NamedTuple):
164
+ """
165
+ Character span in the original string.
166
+
167
+ Args:
168
+ start (`int`): Index of the first character in the original string.
169
+ end (`int`): Index of the character following the last character in the original string.
170
+ """
171
+
172
+ start: int
173
+ end: int
174
+
175
+
176
+ class TokenSpan(NamedTuple):
177
+ """
178
+ Token span in an encoded string (list of tokens).
179
+
180
+ Args:
181
+ start (`int`): Index of the first token in the span.
182
+ end (`int`): Index of the token following the last token in the span.
183
+ """
184
+
185
+ start: int
186
+ end: int
187
+
188
+
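# CharSpan and TokenSpan are plain NamedTuples; a short sketch with made-up offsets:
span = CharSpan(start=0, end=5)    # characters in the half-open range [0, 5)
assert span.start == 0 and span.end == 5
first_char, one_past_last = span   # ordinary tuple unpacking also works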
189
+ class BatchEncoding(UserDict):
190
+ """
191
+ Holds the output of the [`PretrainedTokenizerBase.__call__`],
192
+ [`PretrainedTokenizerBase.encode_plus`] and
193
+ [`PretrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).
194
+
195
+ This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
196
+ utility methods to map from word/character space to token space.
197
+
198
+ Args:
199
+ data (`dict`):
200
+ Dictionary of lists/arrays/tensors returned by the `__call__`/`encode`/`batch_encode` methods
201
+ ('input_ids', 'attention_mask', etc.).
202
+ tensor_type (`Union[None, str, TensorType]`, *optional*):
203
+             You can give a tensor_type here to convert the lists of integers into Paddle/NumPy tensors at
204
+ initialization.
205
+ prepend_batch_axis (`bool`, *optional*, defaults to `False`):
206
+ Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
207
+ """
208
+
209
+ def __init__(
210
+ self,
211
+ data: Optional[Dict[str, Any]] = None,
212
+ encoding: Optional[Union[FastEncoding, Sequence[FastEncoding]]] = None,
213
+ tensor_type: Union[None, str] = None,
214
+ prepend_batch_axis: bool = False,
215
+ n_sequences: Optional[int] = None,
216
+ ):
217
+ super().__init__(data)
218
+
219
+ if isinstance(encoding, FastEncoding):
220
+ encoding = [encoding]
221
+
222
+ self._encodings = encoding
223
+
224
+ if n_sequences is None and encoding is not None and len(encoding):
225
+ n_sequences = encoding[0].n_sequences
226
+
227
+ self._n_sequences = n_sequences
228
+
229
+ self.convert_to_tensors(
230
+ tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis
231
+ )
232
+
233
+ @property
234
+ def n_sequences(self) -> Optional[int]:
235
+ """
236
+ `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
237
+ [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
238
+ sentences)
239
+ """
240
+ return self._n_sequences
241
+
242
+ @property
243
+ def is_fast(self) -> bool:
244
+ """
245
+ `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PretrainedFastTokenizer`]
246
+ or not.
247
+ """
248
+ return self._encodings is not None
249
+
250
+ def __getitem__(self, item: Union[int, str]) -> Union[Any, FastEncoding]:
251
+ """
252
+ If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
253
+ etc.).
254
+
255
+ If the key is an integer, get the `Encoding` for batch item with index `key`.
256
+ """
257
+ if isinstance(item, str):
258
+ return self.data[item]
259
+ elif self._encodings is not None:
260
+ return self._encodings[item]
261
+ else:
262
+ raise KeyError(
263
+ "Indexing with integers is not available when using tokenizer.__call__()"
264
+ " with return_dict=True. Please set return_dict to False to use integer indexing."
265
+ )
266
+
267
+ def __getattr__(self, item: str):
268
+ try:
269
+ return self.data[item]
270
+ except KeyError:
271
+ raise AttributeError
272
+
273
+ def __getstate__(self):
274
+ return {"data": self.data, "encodings": self._encodings}
275
+
276
+ def __setstate__(self, state):
277
+ if "data" in state:
278
+ self.data = state["data"]
279
+
280
+ if "encodings" in state:
281
+ self._encodings = state["encodings"]
282
+
283
+ def keys(self):
284
+ return self.data.keys()
285
+
286
+ def values(self):
287
+ return self.data.values()
288
+
289
+ def items(self):
290
+ return self.data.items()
291
+
292
+ # After this point:
293
+ # Extended properties and methods only available for fast tokenizers
294
+ # not yet supported
295
+
296
+ @property
297
+ def encodings(self) -> Optional[List[FastEncoding]]:
298
+ """
299
+         `Optional[List[FastEncoding]]`: The list of all encodings from the tokenization process. Returns `None` if
300
+         the input was tokenized by a Python (i.e., not fast) tokenizer.
301
+ """
302
+ return self._encodings
303
+
304
+ def tokens(self, batch_index: int = 0) -> List[str]:
305
+ """
306
+ Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
307
+ integer indices) at a given batch index (only works for the output of a fast tokenizer).
308
+
309
+ Args:
310
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
311
+
312
+ Returns:
313
+ `List[str]`: The list of tokens at that index.
314
+ """
315
+ if not self._encodings:
316
+ raise ValueError(
317
+ "tokens() is not available when using Python-based tokenizers"
318
+ )
319
+ return self._encodings[batch_index].tokens
320
+
321
+ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
322
+ """
323
+ Return a list mapping the tokens to the id of their original sentences:
324
+
325
+ - `None` for special tokens added around or between sequences,
326
+ - `0` for tokens corresponding to words in the first sequence,
327
+ - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
328
+ encoded.
329
+
330
+ Args:
331
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
332
+
333
+ Returns:
334
+ `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
335
+ by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
336
+ sequence.
337
+ """
338
+ if not self._encodings:
339
+ raise ValueError(
340
+ "sequence_ids() is not available when using Python-based tokenizers"
341
+ )
342
+ return self._encodings[batch_index].sequence_ids
343
+
344
+ def words(self, batch_index: int = 0) -> List[Optional[int]]:
345
+ """
346
+ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
347
+
348
+ Args:
349
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
350
+
351
+ Returns:
352
+ `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
353
+ tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
354
+ (several tokens will be mapped to the same word index if they are parts of that word).
355
+ """
356
+ if not self._encodings:
357
+ raise ValueError(
358
+ "words() is not available when using Python-based tokenizers"
359
+ )
360
+ warnings.warn(
361
+ "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
362
+ "but more self-explanatory `BatchEncoding.word_ids()` property.",
363
+ FutureWarning,
364
+ )
365
+ return self.word_ids(batch_index)
366
+
367
+ def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
368
+ """
369
+ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
370
+
371
+ Args:
372
+ batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
373
+
374
+ Returns:
375
+ `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
376
+ tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
377
+ (several tokens will be mapped to the same word index if they are parts of that word).
378
+ """
379
+ if not self._encodings:
380
+ raise ValueError(
381
+ "word_ids() is not available when using Python-based tokenizers"
382
+ )
383
+ return self._encodings[batch_index].word_ids
384
+
385
+ def token_to_sequence(
386
+ self, batch_or_token_index: int, token_index: Optional[int] = None
387
+ ) -> int:
388
+ """
389
+ Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
390
+ for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair
391
+
392
+ Can be called as:
393
+
394
+ - `self.token_to_sequence(token_index)` if batch size is 1
395
+ - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1
396
+
397
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
398
+         words are defined by the user). In this case it makes it easy to associate encoded tokens with the provided
399
+ tokenized words.
400
+
401
+ Args:
402
+ batch_or_token_index (`int`):
403
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
404
+ the token in the sequence.
405
+ token_index (`int`, *optional*):
406
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
407
+ sequence.
408
+
409
+ Returns:
410
+             `int`: Index of the sequence the token belongs to.
411
+ """
412
+
413
+ if not self._encodings:
414
+ raise ValueError(
415
+ "token_to_sequence() is not available when using Python based tokenizers"
416
+ )
417
+ if token_index is not None:
418
+ batch_index = batch_or_token_index
419
+ else:
420
+ batch_index = 0
421
+ token_index = batch_or_token_index
422
+ if batch_index < 0:
423
+ batch_index = self._batch_size + batch_index
424
+ if token_index < 0:
425
+ token_index = self._seq_len + token_index
426
+ return self._encodings[batch_index].token_to_sequence(token_index)
427
+
428
+ def token_to_word(
429
+ self, batch_or_token_index: int, token_index: Optional[int] = None
430
+ ) -> int:
431
+ """
432
+         Get the index of the word corresponding to (i.e. comprising) an encoded token in a sequence of the batch.
433
+
434
+ Can be called as:
435
+
436
+ - `self.token_to_word(token_index)` if batch size is 1
437
+ - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1
438
+
439
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
440
+         words are defined by the user). In this case it makes it easy to associate encoded tokens with the provided
441
+ tokenized words.
442
+
443
+ Args:
444
+ batch_or_token_index (`int`):
445
+                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
446
+ the token in the sequence.
447
+ token_index (`int`, *optional*):
448
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
449
+ sequence.
450
+
451
+ Returns:
452
+ `int`: Index of the word in the input sequence.
453
+ """
454
+
455
+ if not self._encodings:
456
+ raise ValueError(
457
+ "token_to_word() is not available when using Python based tokenizers"
458
+ )
459
+ if token_index is not None:
460
+ batch_index = batch_or_token_index
461
+ else:
462
+ batch_index = 0
463
+ token_index = batch_or_token_index
464
+ if batch_index < 0:
465
+ batch_index = self._batch_size + batch_index
466
+ if token_index < 0:
467
+ token_index = self._seq_len + token_index
468
+ return self._encodings[batch_index].token_to_word(token_index)
469
+
470
+ def word_to_tokens(
471
+ self,
472
+ batch_or_word_index: int,
473
+ word_index: Optional[int] = None,
474
+ sequence_index: int = 0,
475
+ ) -> Optional[TokenSpan]:
476
+ """
477
+ Get the encoded token span corresponding to a word in a sequence of the batch.
478
+
479
+ Token spans are returned as a [`TokenSpan`] with:
480
+
481
+ - **start** -- Index of the first token.
482
+ - **end** -- Index of the token following the last token.
483
+
484
+ Can be called as:
485
+
486
+ - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
487
+ - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
488
+ 1
489
+
490
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
491
+         are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
492
+ words.
493
+
494
+ Args:
495
+ batch_or_word_index (`int`):
496
+ Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
497
+ the word in the sequence.
498
+ word_index (`int`, *optional*):
499
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
500
+ sequence.
501
+ sequence_index (`int`, *optional*, defaults to 0):
502
+                 If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
503
+ or 1) the provided word index belongs to.
504
+
505
+ Returns:
506
+             `Optional[TokenSpan]`: Span of tokens in the encoded sequence. Returns `None` if
507
+ no tokens correspond to the word.
508
+ """
509
+
510
+ if not self._encodings:
511
+ raise ValueError(
512
+ "word_to_tokens() is not available when using Python based tokenizers"
513
+ )
514
+ if word_index is not None:
515
+ batch_index = batch_or_word_index
516
+ else:
517
+ batch_index = 0
518
+ word_index = batch_or_word_index
519
+ if batch_index < 0:
520
+ batch_index = self._batch_size + batch_index
521
+ if word_index < 0:
522
+ word_index = self._seq_len + word_index
523
+ span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
524
+ return TokenSpan(*span) if span is not None else None
525
+
526
+ def token_to_chars(
527
+ self, batch_or_token_index: int, token_index: Optional[int] = None
528
+ ) -> CharSpan:
529
+ """
530
+ Get the character span corresponding to an encoded token in a sequence of the batch.
531
+
532
+ Character spans are returned as a [`CharSpan`] with:
533
+
534
+ - **start** -- Index of the first character in the original string associated to the token.
535
+ - **end** -- Index of the character following the last character in the original string associated to the
536
+ token.
537
+
538
+ Can be called as:
539
+
540
+ - `self.token_to_chars(token_index)` if batch size is 1
541
+ - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1
542
+
543
+ Args:
544
+ batch_or_token_index (`int`):
545
+                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
546
+ the token in the sequence.
547
+ token_index (`int`, *optional*):
548
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
549
+ the sequence.
550
+
551
+ Returns:
552
+ [`CharSpan`]: Span of characters in the original string.
553
+ """
554
+
555
+ if not self._encodings:
556
+ raise ValueError(
557
+ "token_to_chars() is not available when using Python based tokenizers"
558
+ )
559
+ if token_index is not None:
560
+ batch_index = batch_or_token_index
561
+ else:
562
+ batch_index = 0
563
+ token_index = batch_or_token_index
564
+ return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
565
+
566
+ def char_to_token(
567
+ self,
568
+ batch_or_char_index: int,
569
+ char_index: Optional[int] = None,
570
+ sequence_index: int = 0,
571
+ ) -> int:
572
+ """
573
+ Get the index of the token in the encoded output comprising a character in the original string for a sequence
574
+ of the batch.
575
+
576
+ Can be called as:
577
+
578
+ - `self.char_to_token(char_index)` if batch size is 1
579
+ - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1
580
+
581
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
582
+         are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
583
+ words.
584
+
585
+ Args:
586
+ batch_or_char_index (`int`):
587
+                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
588
+                 the character in the sequence
589
+ char_index (`int`, *optional*):
590
+                 If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the
591
+ sequence.
592
+ sequence_index (`int`, *optional*, defaults to 0):
593
+                 If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
594
+ or 1) the provided character index belongs to.
595
+
596
+
597
+ Returns:
598
+ `int`: Index of the token.
599
+ """
600
+
601
+ if not self._encodings:
602
+ raise ValueError(
603
+ "char_to_token() is not available when using Python based tokenizers"
604
+ )
605
+ if char_index is not None:
606
+ batch_index = batch_or_char_index
607
+ else:
608
+ batch_index = 0
609
+ char_index = batch_or_char_index
610
+ return self._encodings[batch_index].char_to_token(char_index, sequence_index)
611
+
612
+ def word_to_chars(
613
+ self,
614
+ batch_or_word_index: int,
615
+ word_index: Optional[int] = None,
616
+ sequence_index: int = 0,
617
+ ) -> CharSpan:
618
+ """
619
+ Get the character span in the original string corresponding to given word in a sequence of the batch.
620
+
621
+ Character spans are returned as a CharSpan NamedTuple with:
622
+
623
+ - start: index of the first character in the original string
624
+ - end: index of the character following the last character in the original string
625
+
626
+ Can be called as:
627
+
628
+ - `self.word_to_chars(word_index)` if batch size is 1
629
+ - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1
630
+
631
+ Args:
632
+ batch_or_word_index (`int`):
633
+                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
634
+ the word in the sequence
635
+ word_index (`int`, *optional*):
636
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
637
+ sequence.
638
+ sequence_index (`int`, *optional*, defaults to 0):
639
+                 If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
640
+ or 1) the provided word index belongs to.
641
+
642
+ Returns:
643
+ `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
644
+             is a NamedTuple with:
645
+
646
+ - start: index of the first character associated to the token in the original string
647
+ - end: index of the character following the last character associated to the token in the original
648
+ string
649
+ """
650
+
651
+ if not self._encodings:
652
+ raise ValueError(
653
+ "word_to_chars() is not available when using Python based tokenizers"
654
+ )
655
+ if word_index is not None:
656
+ batch_index = batch_or_word_index
657
+ else:
658
+ batch_index = 0
659
+ word_index = batch_or_word_index
660
+ return CharSpan(
661
+ *(self._encodings[batch_index].word_to_chars(word_index, sequence_index))
662
+ )
663
+
664
+ def char_to_word(
665
+ self,
666
+ batch_or_char_index: int,
667
+ char_index: Optional[int] = None,
668
+ sequence_index: int = 0,
669
+ ) -> int:
670
+ """
671
+ Get the word in the original string corresponding to a character in the original string of a sequence of the
672
+ batch.
673
+
674
+ Can be called as:
675
+
676
+ - `self.char_to_word(char_index)` if batch size is 1
677
+ - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1
678
+
679
+ This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
680
+         are defined by the user). In this case it makes it easy to associate encoded tokens with the provided tokenized
681
+ words.
682
+
683
+ Args:
684
+ batch_or_char_index (`int`):
685
+                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
686
+ the character in the original string.
687
+ char_index (`int`, *optional*):
688
+ If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the
689
+ original string.
690
+ sequence_index (`int`, *optional*, defaults to 0):
691
+                 If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
692
+ or 1) the provided character index belongs to.
693
+
694
+
695
+ Returns:
696
+             `int` or `List[int]`: Index or indices of the corresponding word(s) in the original string.
697
+ """
698
+
699
+ if not self._encodings:
700
+ raise ValueError(
701
+ "char_to_word() is not available when using Python based tokenizers"
702
+ )
703
+ if char_index is not None:
704
+ batch_index = batch_or_char_index
705
+ else:
706
+ batch_index = 0
707
+ char_index = batch_or_char_index
708
+ return self._encodings[batch_index].char_to_word(char_index, sequence_index)
709
+
710
+ def convert_to_tensors(
711
+ self,
712
+ tensor_type: Optional[Union[str, TensorType]] = None,
713
+ prepend_batch_axis: bool = False,
714
+ ):
715
+ """
716
+ Convert the inner content to tensors.
717
+
718
+ Args:
719
+ tensor_type (`str` or [`TensorType`], *optional*):
720
+ The type of tensors to use. If `str`, should be one of the values of the enum [`TensorType`]. If
721
+ `None`, no modification is done.
722
+ prepend_batch_axis (`int`, *optional*, defaults to `False`):
723
+ Whether or not to add the batch dimension during the conversion.
724
+ """
725
+ if tensor_type is None:
726
+ return self
727
+
728
+ # Convert to TensorType
729
+ if not isinstance(tensor_type, TensorType):
730
+ tensor_type = TensorType(tensor_type)
731
+ # Get a function reference for the correct framework
732
+ if tensor_type == TensorType.PADDLE:
733
+ as_tensor = paddle.to_tensor
734
+ is_tensor = paddle.is_tensor
735
+ else:
736
+ as_tensor = np.asarray
737
+ is_tensor = _is_numpy
738
+
739
+ # Do the tensor conversion in batch
740
+ for key, value in self.items():
741
+ try:
742
+ if prepend_batch_axis:
743
+ value = [value]
744
+
745
+ if not is_tensor(value):
746
+ tensor = as_tensor(value)
747
+
748
+ self[key] = tensor
749
+ except: # noqa E722
750
+ if key == "overflowing_tokens":
751
+ raise ValueError(
752
+ "Unable to create tensor returning overflowing tokens of different lengths. "
753
+ "Please see if a fast version of this tokenizer is available to have this feature available."
754
+ )
755
+ raise ValueError(
756
+ "Unable to create tensor, you should probably activate truncation and/or padding "
757
+ "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
758
+ )
759
+
760
+ return self
761
+
762
+
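# A minimal sketch of BatchEncoding as a dict-like container; the ids below are made
# up and would normally come from a tokenizer's __call__/encode methods.
enc = BatchEncoding({"input_ids": [[1, 5, 7], [1, 9, 2]], "token_type_ids": [[0, 0, 0], [0, 0, 0]]})
print(enc["input_ids"])        # plain dict-style access
print(enc.input_ids)           # attribute access via __getattr__
enc.convert_to_tensors("np")   # or "pd" to get Paddle tensors instead
print(type(enc["input_ids"]))  # numpy.ndarray after conversion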
763
+ class SpecialTokensMixin:
764
+ """
765
+     A mixin inherited by [`PretrainedTokenizer`] to handle specific behaviors related to
766
+     special tokens. In particular, this class holds the attributes which can be used to directly access these special
767
+     tokens in a model-independent manner and allows setting and updating the special tokens.
768
+
769
+ Args:
770
+ bos_token (`str` or `AddedToken`, *optional*):
771
+ A special token representing the beginning of a sentence.
772
+ eos_token (`str` or `AddedToken`, *optional*):
773
+ A special token representing the end of a sentence.
774
+ unk_token (`str` or `AddedToken`, *optional*):
775
+ A special token representing an out-of-vocabulary token.
776
+ sep_token (`str` or `AddedToken`, *optional*):
777
+ A special token separating two different sentences in the same input (used by BERT for instance).
778
+ pad_token (`str` or `AddedToken`, *optional*):
779
+ A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
780
+ attention mechanisms or loss computation.
781
+ cls_token (`str` or `AddedToken`, *optional*):
782
+ A special token representing the class of the input (used by BERT for instance).
783
+ mask_token (`str` or `AddedToken`, *optional*):
784
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
785
+ BERT).
786
+ additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*):
787
+ A tuple or a list of additional special tokens.
788
+ """
789
+
790
+ SPECIAL_TOKENS_ATTRIBUTES = [
791
+ "bos_token",
792
+ "eos_token",
793
+ "unk_token",
794
+ "sep_token",
795
+ "pad_token",
796
+ "cls_token",
797
+ "mask_token",
798
+ "additional_special_tokens",
799
+ ]
800
+
801
+ def __init__(self, verbose=True, **kwargs):
802
+         # note(guosheng): Since `__init__` might be called multiple times and is
803
+         # hooked before `PretrainedTokenizer.__init__`, we do not reset these to None
804
+         # (as HF does) in order to avoid unintentionally overriding existing values.
805
+ self._bos_token = getattr(self, "_bos_token", None)
806
+ self._eos_token = getattr(self, "_eos_token", None)
807
+ self._unk_token = getattr(self, "_unk_token", None)
808
+ self._sep_token = getattr(self, "_sep_token", None)
809
+ self._pad_token = getattr(self, "_pad_token", None)
810
+ self._cls_token = getattr(self, "_cls_token", None)
811
+ self._mask_token = getattr(self, "_mask_token", None)
812
+ self._pad_token_type_id = getattr(self, "_pad_token_type_id", 0)
813
+ self._additional_special_tokens = getattr(
814
+ self, "_additional_special_tokens", []
815
+ )
816
+ self.verbose = verbose
817
+
818
+ # We directly set the hidden value to allow initialization with special tokens
819
+ # which are not yet in the vocabulary. Necessary for serialization/de-serialization
820
+ # TODO clean this up at some point (probably by switching to fast tokenizers)
821
+ for key, value in kwargs.items():
822
+ if value is None:
823
+ continue
824
+ if key in self.SPECIAL_TOKENS_ATTRIBUTES:
825
+ if key == "additional_special_tokens":
826
+ assert isinstance(
827
+ value, (list, tuple)
828
+ ), f"Value {value} is not a list or tuple"
829
+ assert all(
830
+ isinstance(t, (str, AddedToken)) for t in value
831
+ ), "One of the tokens is not a string or an AddedToken"
832
+ setattr(self, key, value)
833
+ elif isinstance(value, (str, AddedToken)):
834
+ setattr(self, key, value)
835
+ else:
836
+ raise TypeError(
837
+ f"special token {key} has to be either str or AddedToken but got: {type(value)}"
838
+ )
839
+
840
+ def sanitize_special_tokens(self) -> int:
841
+ """
842
+ Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
843
+ `tokenizer.cls_token`, etc.) are in the vocabulary.
844
+
845
+ Add the missing ones to the vocabulary if needed.
846
+
847
+ Return:
848
+ `int`: The number of tokens added in the vocabulary during the operation.
849
+ """
850
+ return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
851
+
852
+ def add_special_tokens(
853
+ self, special_tokens_dict: Dict[str, Union[str, AddedToken]]
854
+ ) -> int:
855
+ """
856
+ Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
857
+ special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
858
+ current vocabulary).
859
+
860
+         Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
861
+ matrix of the model so that its embedding matrix matches the tokenizer.
862
+
863
+ In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
864
+
865
+ Using `add_special_tokens` will ensure your special tokens can be used in several ways:
866
+
867
+ - Special tokens are carefully handled by the tokenizer (they are never split).
868
+ - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
869
+ makes it easy to develop model-agnostic training and fine-tuning scripts.
870
+
871
+ When possible, special tokens are already registered for provided pretrained models (for instance
872
+         [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
873
+ `'</s>'`).
874
+
875
+ Args:
876
+ special_tokens_dict (dictionary *str* to *str* or `AddedToken`):
877
+ Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
878
+ `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].
879
+
880
+ Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
881
+                 assigns the index of the `unk_token` to them).
882
+
883
+ Returns:
884
+ `int`: Number of tokens added to the vocabulary.
885
+
886
+ Examples:
887
+
888
+ ```python
889
+ # Let's see how to add a new classification token to GPT-2
890
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
891
+ model = GPT2Model.from_pretrained("gpt2")
892
+
893
+ special_tokens_dict = {"cls_token": "<CLS>"}
894
+
895
+ num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
896
+ print("We have added", num_added_toks, "tokens")
897
+         # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
898
+ model.resize_token_embeddings(len(tokenizer))
899
+
900
+ assert tokenizer.cls_token == "<CLS>"
901
+ ```"""
902
+ if not special_tokens_dict:
903
+ return 0
904
+
905
+ added_tokens = 0
906
+ for key, value in special_tokens_dict.items():
907
+ assert (
908
+ key in self.SPECIAL_TOKENS_ATTRIBUTES
909
+ ), f"Key {key} is not a special token"
910
+
911
+ if self.verbose:
912
+ logging.info(f"Assigning {value} to the {key} key of the tokenizer")
913
+ setattr(self, key, value)
914
+
915
+ if key == "additional_special_tokens":
916
+ assert isinstance(value, (list, tuple)) and all(
917
+ isinstance(t, (str, AddedToken)) for t in value
918
+ ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
919
+ added_tokens += self.add_tokens(value, special_tokens=True)
920
+ else:
921
+ assert isinstance(
922
+ value, (str, AddedToken)
923
+ ), f"Token {value} for key {key} should be a str or an AddedToken instance"
924
+ added_tokens += self.add_tokens([value], special_tokens=True)
925
+
926
+ return added_tokens
927
+
928
+ def add_tokens(
929
+ self,
930
+ new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]],
931
+ special_tokens: bool = False,
932
+ ) -> int:
933
+ """
934
+ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
935
+         it with indices starting from the length of the current vocabulary.
936
+
937
+         Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
938
+ matrix of the model so that its embedding matrix matches the tokenizer.
939
+
940
+ In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
941
+
942
+ Args:
943
+ new_tokens (`str`, `AddedToken` or a list of *str* or `AddedToken`):
944
+ Tokens are only added if they are not already in the vocabulary. `AddedToken` wraps a string
945
+ token to let you personalize its behavior: whether this token should only match against a single word,
946
+ whether this token should strip all potential whitespaces on the left side, whether this token should
947
+ strip all potential whitespaces on the right side, etc.
948
+ special_tokens (`bool`, *optional*, defaults to `False`):
949
+                 Can be used to specify if the token is a special token. This mostly changes the normalization behavior
950
+ (special tokens like CLS or [MASK] are usually not lower-cased for instance).
951
+
952
+ Returns:
953
+ `int`: Number of tokens added to the vocabulary.
954
+
955
+ Examples:
956
+
957
+ ```python
958
+ # Let's see how to increase the vocabulary of Bert model and tokenizer
959
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
960
+ model = BertModel.from_pretrained("bert-base-uncased")
961
+
962
+ num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
963
+ print("We have added", num_added_toks, "tokens")
964
+         # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
965
+ model.resize_token_embeddings(len(tokenizer))
966
+ ```"""
967
+ if not new_tokens:
968
+ return 0
969
+
970
+ if not isinstance(new_tokens, (list, tuple)):
971
+ new_tokens = [new_tokens]
972
+
973
+ return self._add_tokens(new_tokens, special_tokens=special_tokens)
974
+
975
+ def _add_tokens(
976
+ self,
977
+ new_tokens: Union[List[str], List[AddedToken]],
978
+ special_tokens: bool = False,
979
+ ) -> int:
980
+ raise NotImplementedError
981
+
982
+ @property
983
+ def bos_token(self) -> str:
984
+ """
985
+ `str`: Beginning of sentence token. Log an error if used while not having been set.
986
+ """
987
+ if self._bos_token is None and self.verbose:
988
+ logging.error("Using bos_token, but it is not set yet.")
989
+ return None
990
+ return str(self._bos_token)
991
+
992
+ @property
993
+ def eos_token(self) -> str:
994
+ """
995
+ `str`: End of sentence token. Log an error if used while not having been set.
996
+ """
997
+ if self._eos_token is None and self.verbose:
998
+ logging.error("Using eos_token, but it is not set yet.")
999
+ return None
1000
+ return str(self._eos_token)
1001
+
1002
+ @property
1003
+ def unk_token(self) -> str:
1004
+ """
1005
+ `str`: Unknown token. Log an error if used while not having been set.
1006
+ """
1007
+ if self._unk_token is None and self.verbose:
1008
+ logging.error("Using unk_token, but it is not set yet.")
1009
+ return None
1010
+ return str(self._unk_token)
1011
+
1012
+ @property
1013
+ def sep_token(self) -> str:
1014
+ """
1015
+ `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
1016
+ having been set.
1017
+ """
1018
+ if self._sep_token is None and self.verbose:
1019
+ logging.error("Using sep_token, but it is not set yet.")
1020
+ return None
1021
+ return str(self._sep_token)
1022
+
1023
+ @property
1024
+ def pad_token(self) -> str:
1025
+ """
1026
+ `str`: Padding token. Log an error if used while not having been set.
1027
+ """
1028
+ if self._pad_token is None and self.verbose:
1029
+ logging.error("Using pad_token, but it is not set yet.")
1030
+ return None
1031
+ return str(self._pad_token)
1032
+
1033
+ @property
1034
+ def cls_token(self) -> str:
1035
+ """
1036
+ `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
1037
+ depth of the model. Log an error if used while not having been set.
1038
+ """
1039
+ if self._cls_token is None and self.verbose:
1040
+ logging.error("Using cls_token, but it is not set yet.")
1041
+ return None
1042
+ return str(self._cls_token)
1043
+
1044
+ @property
1045
+ def mask_token(self) -> str:
1046
+ """
1047
+ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
1048
+ having been set.
1049
+ """
1050
+ if self._mask_token is None and self.verbose:
1051
+ logging.error("Using mask_token, but it is not set yet.")
1052
+ return None
1053
+ return str(self._mask_token)
1054
+
1055
+ @property
1056
+ def additional_special_tokens(self) -> List[str]:
1057
+ """
1058
+ `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
1059
+ set.
1060
+ """
1061
+ if self._additional_special_tokens is None and self.verbose:
1062
+ logging.error("Using additional_special_tokens, but it is not set yet.")
1063
+ return None
1064
+ return [str(tok) for tok in self._additional_special_tokens]
1065
+
1066
+ @bos_token.setter
1067
+ def bos_token(self, value):
1068
+ self._bos_token = value
1069
+
1070
+ @eos_token.setter
1071
+ def eos_token(self, value):
1072
+ self._eos_token = value
1073
+
1074
+ @unk_token.setter
1075
+ def unk_token(self, value):
1076
+ self._unk_token = value
1077
+
1078
+ @sep_token.setter
1079
+ def sep_token(self, value):
1080
+ self._sep_token = value
1081
+
1082
+ @pad_token.setter
1083
+ def pad_token(self, value):
1084
+ self._pad_token = value
1085
+
1086
+ @cls_token.setter
1087
+ def cls_token(self, value):
1088
+ self._cls_token = value
1089
+
1090
+ @mask_token.setter
1091
+ def mask_token(self, value):
1092
+ self._mask_token = value
1093
+
1094
+ @additional_special_tokens.setter
1095
+ def additional_special_tokens(self, value):
1096
+ self._additional_special_tokens = value
1097
+
1098
+ @property
1099
+ def bos_token_id(self) -> Optional[int]:
1100
+ """
1101
+ `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not
1102
+ been set.
1103
+ """
1104
+ if self._bos_token is None:
1105
+ return None
1106
+ return self.convert_tokens_to_ids(self.bos_token)
1107
+
1108
+ @property
1109
+ def eos_token_id(self) -> Optional[int]:
1110
+ """
1111
+ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
1112
+ set.
1113
+ """
1114
+ if self._eos_token is None:
1115
+ return None
1116
+ return self.convert_tokens_to_ids(self.eos_token)
1117
+
1118
+ @property
1119
+ def unk_token_id(self) -> Optional[int]:
1120
+ """
1121
+ `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
1122
+ """
1123
+ if self._unk_token is None:
1124
+ return None
1125
+ return self.convert_tokens_to_ids(self.unk_token)
1126
+
1127
+ @property
1128
+ def sep_token_id(self) -> Optional[int]:
1129
+ """
1130
+ `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
1131
+ sequence. Returns `None` if the token has not been set.
1132
+ """
1133
+ if self._sep_token is None:
1134
+ return None
1135
+ return self.convert_tokens_to_ids(self.sep_token)
1136
+
1137
+ @property
1138
+ def pad_token_id(self) -> Optional[int]:
1139
+ """
1140
+ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
1141
+ """
1142
+ if self._pad_token is None:
1143
+ return None
1144
+ return self.convert_tokens_to_ids(self.pad_token)
1145
+
1146
+ @property
1147
+ def pad_token_type_id(self) -> int:
1148
+ """
1149
+ `int`: Id of the padding token type in the vocabulary.
1150
+ """
1151
+ return self._pad_token_type_id
1152
+
1153
+ @property
1154
+ def cls_token_id(self) -> Optional[int]:
1155
+ """
1156
+ `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
1157
+ leveraging self-attention along the full depth of the model.
1158
+
1159
+ Returns `None` if the token has not been set.
1160
+ """
1161
+ if self._cls_token is None:
1162
+ return None
1163
+ return self.convert_tokens_to_ids(self.cls_token)
1164
+
1165
+ @property
1166
+ def mask_token_id(self) -> Optional[int]:
1167
+ """
1168
+ `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
1169
+ modeling. Returns `None` if the token has not been set.
1170
+ """
1171
+ if self._mask_token is None:
1172
+ return None
1173
+ return self.convert_tokens_to_ids(self.mask_token)
1174
+
1175
+ @property
1176
+ def additional_special_tokens_ids(self) -> List[int]:
1177
+ """
1178
+ `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
1179
+ been set.
1180
+ """
1181
+ return self.convert_tokens_to_ids(self.additional_special_tokens)
1182
+
1183
+ @bos_token_id.setter
1184
+ def bos_token_id(self, value):
1185
+ self._bos_token = (
1186
+ self.convert_ids_to_tokens(value) if value is not None else None
1187
+ )
1188
+
1189
+ @eos_token_id.setter
1190
+ def eos_token_id(self, value):
1191
+ self._eos_token = (
1192
+ self.convert_ids_to_tokens(value) if value is not None else None
1193
+ )
1194
+
1195
+ @unk_token_id.setter
1196
+ def unk_token_id(self, value):
1197
+ self._unk_token = (
1198
+ self.convert_ids_to_tokens(value) if value is not None else None
1199
+ )
1200
+
1201
+ @sep_token_id.setter
1202
+ def sep_token_id(self, value):
1203
+ self._sep_token = (
1204
+ self.convert_ids_to_tokens(value) if value is not None else None
1205
+ )
1206
+
1207
+ @pad_token_id.setter
1208
+ def pad_token_id(self, value):
1209
+ self._pad_token = (
1210
+ self.convert_ids_to_tokens(value) if value is not None else None
1211
+ )
1212
+
1213
+ @cls_token_id.setter
1214
+ def cls_token_id(self, value):
1215
+ self._cls_token = (
1216
+ self.convert_ids_to_tokens(value) if value is not None else None
1217
+ )
1218
+
1219
+ @mask_token_id.setter
1220
+ def mask_token_id(self, value):
1221
+ self._mask_token = (
1222
+ self.convert_ids_to_tokens(value) if value is not None else None
1223
+ )
1224
+
1225
+ @additional_special_tokens_ids.setter
1226
+ def additional_special_tokens_ids(self, values):
1227
+ self._additional_special_tokens = [
1228
+ self.convert_ids_to_tokens(value) for value in values
1229
+ ]
1230
+
1231
+ @property
1232
+ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
1233
+ """
1234
+ `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
1235
+ `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).
1236
+
1237
+ Convert potential tokens of `AddedToken` type to string.
1238
+ """
1239
+ set_attr = {}
1240
+ for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1241
+ attr_value = getattr(self, "_" + attr)
1242
+ if attr_value:
1243
+ set_attr[attr] = (
1244
+ type(attr_value)(
1245
+ str(attr_value_sub) for attr_value_sub in attr_value
1246
+ )
1247
+ if isinstance(attr_value, (list, tuple))
1248
+ else str(attr_value)
1249
+ )
1250
+ return set_attr
1251
+
1252
+ @property
1253
+ def special_tokens_map_extended(
1254
+ self,
1255
+ ) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
1256
+ """
1257
+ `Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]`: A dictionary mapping
1258
+ special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).
1259
+
1260
+ Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
1261
+ special tokens are tokenized.
1262
+ """
1263
+ set_attr = {}
1264
+ for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
1265
+ attr_value = getattr(self, "_" + attr, None)
1266
+ if attr_value:
1267
+ set_attr[attr] = attr_value
1268
+ return set_attr
1269
+
1270
+ @property
1271
+ def all_special_tokens(self) -> List[str]:
1272
+ """
1273
+ `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
1274
+
1275
+ Convert tokens of `AddedToken` type to string.
1276
+ """
1277
+ all_toks = [str(s) for s in self.all_special_tokens_extended]
1278
+ return all_toks
1279
+
1280
+ @property
1281
+ def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
1282
+ """
1283
+ `List[Union[str, AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
1284
+ attributes.
1285
+
1286
+ Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
1287
+ special tokens are tokenized.
1288
+ """
1289
+ all_toks = []
1290
+ set_attr = self.special_tokens_map_extended
1291
+ for attr_value in set_attr.values():
1292
+ all_toks = all_toks + (
1293
+ list(attr_value)
1294
+ if isinstance(attr_value, (list, tuple))
1295
+ else [attr_value]
1296
+ )
1297
+ all_toks = list(OrderedDict.fromkeys(all_toks))
1298
+ return all_toks
1299
+
1300
+ @property
1301
+ def all_special_ids(self) -> List[int]:
1302
+ """
1303
+ `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
1304
+ """
1305
+ all_toks = self.all_special_tokens
1306
+ all_ids = self.convert_tokens_to_ids(all_toks)
1307
+ return all_ids
1308
+
1309
+
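# A sketch of how the mixin tracks special tokens. SpecialTokensMixin is normally
# combined with a concrete tokenizer, so only the string-level attributes are shown
# here; id-based properties would additionally need convert_tokens_to_ids.
special = SpecialTokensMixin(bos_token="<s>", eos_token="</s>", unk_token="<unk>")
assert special.bos_token == "<s>" and special.eos_token == "</s>"
print(special.special_tokens_map)   # {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
print(special.all_special_tokens)   # ['<s>', '</s>', '<unk>'] (deduplicated, order preserved)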
1310
+ class PretrainedTokenizerBase(SpecialTokensMixin):
1311
+ """
1312
+ Base class for [`PretrainedTokenizer`].
1313
+
1314
+ Class attributes (overridden by derived classes)
1315
+
1316
+ - **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
1317
+ vocabulary file required by the model, and as associated values, the filename for saving the associated file
1318
+ (string).
1319
+ - **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
1320
+ high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
1321
+ low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
1322
+ associated pretrained vocabulary file.
1323
+ - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
1324
+ of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
1325
+ or `None` if the model has no maximum input size.
1326
+ - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
1327
+ `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
1328
+ pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
1329
+ with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method.
1330
+ - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
1331
+ - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
1332
+ Should be `'right'` or `'left'`.
1333
+ - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
1334
+ applied. Should be `'right'` or `'left'`.
1335
+
1336
+ Args:
1337
+ model_max_length (`int`, *optional*):
1338
+ The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
1339
+ loaded with [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`], this will be set to the
1340
+ value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
1341
+ default to VERY_LARGE_INTEGER (`int(1e30)`).
1342
+ padding_side (`str`, *optional*):
1343
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1344
+ Default value is picked from the class attribute of the same name.
1345
+ truncation_side (`str`, *optional*):
1346
+ The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
1347
+ Default value is picked from the class attribute of the same name.
1348
+ model_input_names (`List[string]`, *optional*):
1349
+ The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
1350
+ `"attention_mask"`). Default value is picked from the class attribute of the same name.
1351
+ bos_token (`str` or `AddedToken`, *optional*):
1352
+ A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
1353
+ `self.bos_token_id`.
1354
+ eos_token (`str` or `AddedToken`, *optional*):
1355
+ A special token representing the end of a sentence. Will be associated to `self.eos_token` and
1356
+ `self.eos_token_id`.
1357
+ unk_token (`str` or `AddedToken`, *optional*):
1358
+ A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
1359
+ `self.unk_token_id`.
1360
+ sep_token (`str` or `AddedToken`, *optional*):
1361
+ A special token separating two different sentences in the same input (used by BERT for instance). Will be
1362
+ associated to `self.sep_token` and `self.sep_token_id`.
1363
+ pad_token (`str` or `AddedToken`, *optional*):
1364
+ A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
1365
+ attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`.
1366
+ cls_token (`str` or `AddedToken`, *optional*):
1367
+ A special token representing the class of the input (used by BERT for instance). Will be associated to
1368
+ `self.cls_token` and `self.cls_token_id`.
1369
+ mask_token (`str` or `AddedToken`, *optional*):
1370
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
1371
+ BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
1372
+ additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*):
1373
+ A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
1374
+ tokenization process. Will be associated to `self.additional_special_tokens` and
1375
+ `self.additional_special_tokens_ids`.
1376
+ """
1377
+
1378
+ resource_files_names: Dict[str, str] = {}
1379
+ pretrained_resource_files_map: Dict[str, Dict[str, str]] = {}
1380
+ pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
1381
+ max_model_input_sizes: Dict[str, Optional[int]] = {}
1382
+ _auto_class: Optional[str] = None
1383
+ tokenizer_config_file = TOKENIZER_CONFIG_NAME
1384
+
1385
+ # first name has to correspond to main model input name
1386
+ # to make sure `tokenizer.pad(...)` works correctly
1387
+ model_input_names: List[str] = ["input_ids", "token_type_ids"]
1388
+ padding_side: str = "right"
1389
+ truncation_side: str = "right"
1390
+ slow_tokenizer_class = None
1391
+
1392
+ def __init__(self, **kwargs):
1393
+ # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
1394
+ self.init_inputs = ()
1395
+
1396
+ self.init_kwargs = getattr(self, "init_kwargs", None) or copy.deepcopy(kwargs)
1397
+ self.name_or_path = kwargs.pop("name_or_path", "")
1398
+ self._processor_class = kwargs.pop("processor_class", None)
1399
+
1400
+         # For backward compatibility, we fall back to setting model_max_length from max_len if provided
1401
+ model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
1402
+ self.model_max_length = (
1403
+ model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
1404
+ )
1405
+
1406
+ # Padding and truncation sides are "right" by default and may be overridden in subclasses. If specified in the kwargs, they
1407
+ # are changed accordingly.
1408
+ self.padding_side = kwargs.pop("padding_side", self.padding_side)
1409
+ if self.padding_side not in ["right", "left"]:
1410
+ raise ValueError(
1411
+ f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
1412
+ )
1413
+
1414
+ self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
1415
+ if self.truncation_side not in ["right", "left"]:
1416
+ raise ValueError(
1417
+ f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
1418
+ )
1419
+
1420
+ self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
1421
+
1422
+ self.deprecation_warnings = (
1423
+ {}
1424
+ ) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
1425
+
1426
+ super().__init__(**kwargs)
1427
+
1428
+ @property
1429
+ def max_len_single_sentence(self) -> int:
1430
+ """
1431
+ `int`: The maximum length of a sentence that can be fed to the model.
1432
+ """
1433
+ return self.model_max_length - self.num_special_tokens_to_add(pair=False)
1434
+
1435
+ @property
1436
+ def max_len_sentences_pair(self) -> int:
1437
+ """
1438
+ `int`: The maximum combined length of a pair of sentences that can be fed to the model.
1439
+ """
1440
+ return self.model_max_length - self.num_special_tokens_to_add(pair=True)
1441
+
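+ # Worked example (a sketch; the exact counts depend on the subclass's
+ # num_special_tokens_to_add): for a BERT-style tokenizer with model_max_length=512,
+ # a single sentence adds 2 special tokens ([CLS], [SEP]) and a pair adds 3, so
+ # max_len_single_sentence == 510 and max_len_sentences_pair == 509.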
1442
+ @max_len_single_sentence.setter
1443
+ def max_len_single_sentence(self, value) -> int:
1444
+ # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
1445
+ if (
1446
+ value == self.model_max_length - self.num_special_tokens_to_add(pair=False)
1447
+ and self.verbose
1448
+ ):
1449
+ if not self.deprecation_warnings.get("max_len_single_sentence", False):
1450
+ warnings.warn(
1451
+ "Setting 'max_len_single_sentence' is now deprecated. "
1452
+ "This value is automatically set up."
1453
+ )
1454
+ self.deprecation_warnings["max_len_single_sentence"] = True
1455
+ else:
1456
+ raise ValueError(
1457
+ "Setting 'max_len_single_sentence' is now deprecated. "
1458
+ "This value is automatically set up."
1459
+ )
1460
+
1461
+ def _switch_to_input_mode(self):
1462
+ """
1463
+ Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
1464
+ """
1465
+ pass
1466
+
1467
+ @max_len_sentences_pair.setter
1468
+ def max_len_sentences_pair(self, value) -> int:
1469
+ # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
1470
+ if (
1471
+ value == self.model_max_length - self.num_special_tokens_to_add(pair=True)
1472
+ and self.verbose
1473
+ ):
1474
+ if not self.deprecation_warnings.get("max_len_sentences_pair", False):
1475
+ warnings.warn(
1476
+ "Setting 'max_len_sentences_pair' is now deprecated. "
1477
+ "This value is automatically set up."
1478
+ )
1479
+ self.deprecation_warnings["max_len_sentences_pair"] = True
1480
+ else:
1481
+ raise ValueError(
1482
+ "Setting 'max_len_sentences_pair' is now deprecated. "
1483
+ "This value is automatically set up."
1484
+ )
1485
+
1486
+ def _set_processor_class(self, processor_class: str):
1487
+ """Sets processor class as an attribute."""
1488
+ self._processor_class = processor_class
1489
+
1490
+ def __repr__(self) -> str:
1491
+ return (
1492
+ f"{'PretrainedTokenizer'}(name_or_path='{self.name_or_path}', "
1493
+ f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, "
1494
+ f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})"
1495
+ )
1496
+
1497
+ def get_vocab(self) -> Dict[str, int]:
1498
+ """
1499
+ Returns the vocabulary as a dictionary of token to index.
1500
+
1501
+ `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
1502
+ vocab.
1503
+
1504
+ Returns:
1505
+ `Dict[str, int]`: The vocabulary.
1506
+ """
1507
+ raise NotImplementedError()
1508
+
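+ # Usage sketch (assumes a concrete subclass such as BertTokenizer implements get_vocab):
+ #
+ #     vocab = tokenizer.get_vocab()
+ #     assert vocab["hello"] == tokenizer.convert_tokens_to_ids("hello")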
1509
+ @classmethod
1510
+ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
1511
+ """
1512
+ Creates an instance of `PretrainedTokenizer`. Related resources are loaded
1513
+ by specifying name of a built-in pretrained model, or a community-contributed
1514
+ pretrained model, or a local file directory path.
1515
+
1516
+ Args:
1517
+ pretrained_model_name_or_path (str): Name of pretrained model or dir path
1518
+ to load from. The string can be:
1519
+
1520
+ - Name of built-in pretrained model
1521
+ - Name of a community-contributed pretrained model.
1522
+ - Local directory path which contains tokenizer related resources
1523
+ and tokenizer config file ("tokenizer_config.json").
1524
+ from_hf_hub (bool, optional): whether to load from Huggingface Hub
1525
+ subfolder (str, optional): An optional value corresponding to a folder inside the repo.
1526
+ Only works when loading from Huggingface Hub.
1527
+ *args (tuple): positional arguments for model `__init__`. If provided,
1528
+ use these as positional argument values for tokenizer initialization.
1529
+ **kwargs (dict): keyword arguments for model `__init__`. If provided,
1530
+ use these to update pre-defined keyword argument values for tokenizer
1531
+ initialization.
1532
+
1533
+ Returns:
1534
+ PretrainedTokenizer: An instance of `PretrainedTokenizer`.
1535
+
1536
+ Example:
1537
+ .. code-block::
1538
+
1539
+ from paddlenlp.transformers import BertTokenizer
1540
+
1541
+ # Name of built-in pretrained model
1542
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1543
+
1544
+ # Name of community-contributed pretrained model
1545
+ tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
1546
+
1547
+ # Load from local directory path
1548
+ tokenizer = BertTokenizer.from_pretrained('./my_bert/')
1549
+ """
1550
+
1551
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
1552
+ cache_dir = kwargs.pop("cache_dir", None)
1553
+ from_hf_hub = kwargs.pop("from_hf_hub", False)
1554
+ from_aistudio = kwargs.pop("from_aistudio", False)
1555
+ subfolder = kwargs.pop("subfolder", "")
1556
+ return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False)
1557
+
1558
+ if subfolder is None:
1559
+ subfolder = ""
1560
+
1561
+ vocab_files = {}
1562
+ init_configuration = {}
1563
+
1564
+ additional_files_names = {
1565
+ "added_tokens_file": ADDED_TOKENS_FILE,
1566
+ "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
1567
+ "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
1568
+ "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME,
1569
+ }
1570
+
1571
+ vocab_files_target = {**cls.resource_files_names, **additional_files_names}
1572
+
1573
+ # From HF Hub or AI Studio
1574
+ if from_hf_hub or from_aistudio:
1575
+ # Only include the necessary resource files specified by the tokenizer cls
1576
+ # Deep copy to avoid modifying the class attributes
1577
+ vocab_files = copy.deepcopy(cls.resource_files_names)
1578
+ vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file
1579
+
1580
+ # From built-in pretrained models
1581
+ elif pretrained_model_name_or_path in cls.pretrained_init_configuration:
1582
+ for file_id, map_list in cls.pretrained_resource_files_map.items():
1583
+ vocab_files[file_id] = map_list[pretrained_model_name_or_path]
1584
+ init_configuration = copy.deepcopy(
1585
+ cls.pretrained_init_configuration[pretrained_model_name_or_path]
1586
+ )
1587
+ # From local dir path
1588
+ elif os.path.isdir(pretrained_model_name_or_path):
1589
+ vocab_files_target["tokenizer_config_file"] = cls.tokenizer_config_file
1590
+ for file_id, file_name in vocab_files_target.items():
1591
+ full_file_name = os.path.join(
1592
+ pretrained_model_name_or_path, subfolder, file_name
1593
+ )
1594
+ if os.path.isfile(full_file_name):
1595
+ vocab_files[file_id] = full_file_name
1596
+ else:
1597
+ # Assuming from community-contributed pretrained models
1598
+ for file_id, file_name in vocab_files_target.items():
1599
+ vocab_files[file_id] = file_name
1600
+
1601
+ resolved_vocab_files = {}
1602
+ for file_id, file_path in vocab_files.items():
1603
+ if file_path is None or os.path.isfile(file_path):
1604
+ resolved_vocab_files[file_id] = file_path
1605
+ continue
1606
+ else:
1607
+ logging.warning("Tokenizer files need to be downloaded, but downloading is not supported yet.")
1608
+ # tokenizer download not support yet
1609
+ # resolved_vocab_files[file_id] = resolve_file_path(
1610
+ # pretrained_model_name_or_path,
1611
+ # [file_path],
1612
+ # subfolder,
1613
+ # cache_dir=cache_dir,
1614
+ # from_aistudio=from_aistudio,
1615
+ # from_hf_hub=from_hf_hub,
1616
+ # )
1617
+
1618
+ for file_id, file_path in resolved_vocab_files.items():
1619
+ if resolved_vocab_files[file_id] is not None:
1620
+ cache_dir = os.path.dirname(resolved_vocab_files[file_id])
1621
+ break
1622
+
1623
+ tokenizer_config_file_dir_list = set()
1624
+ for k, v in resolved_vocab_files.items():
1625
+ if v is not None and os.path.isfile(v):
1626
+ tokenizer_config_file_dir_list.add(os.path.dirname(v))
1627
+ tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list)
1628
+ # TODO: check this
1629
+ assert (
1630
+ len(tokenizer_config_file_dir_list) > 0
1631
+ ), "At least one tokenizer file should be resolved to a local directory."
1632
+ # Prepare tokenizer initialization kwargs
1633
+ # Did we save some inputs and kwargs to reload?
1634
+ has_tokenizer_file = (
1635
+ resolved_vocab_files.get("tokenizer_file", None) is not None
1636
+ )
1637
+ tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
1638
+ if tokenizer_config_file is not None:
1639
+ with io.open(tokenizer_config_file, encoding="utf-8") as f:
1640
+ init_kwargs = json.load(f)
1641
+ else:
1642
+ init_kwargs = init_configuration
1643
+
1644
+ # positional args are stored in kwargs; it may be better not to include them
1645
+ init_args = init_kwargs.pop("init_args", ())
1646
+ init_kwargs.pop("init_class", None)
1647
+
1648
+ # Update with newly provided args and kwargs
1649
+ init_args = init_args if not args else args
1650
+ init_kwargs.update(kwargs)
1651
+
1652
+ def convert_added_tokens(obj):
1653
+ if (
1654
+ isinstance(obj, dict)
1655
+ and "__type" in obj
1656
+ and obj["__type"] == "AddedToken"
1657
+ ):
1658
+ obj.pop("__type")
1659
+ return AddedToken(**obj)
1660
+ elif isinstance(obj, (list, tuple)):
1661
+ return list(convert_added_tokens(o) for o in obj)
1662
+ elif isinstance(obj, dict):
1663
+ return {k: convert_added_tokens(v) for k, v in obj.items()}
1664
+ return obj
1665
+
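+ # For illustration: convert_added_tokens turns serialized entries such as
+ # {"__type": "AddedToken", "content": "<pad>", "lstrip": False} (possibly nested
+ # inside lists or dicts) back into AddedToken instances, and leaves every other
+ # value untouched.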
1666
+ init_kwargs = convert_added_tokens(init_kwargs)
1667
+ # Set max length if needed
1668
+ if pretrained_model_name_or_path in cls.max_model_input_sizes:
1669
+ # if we're using a pretrained model, ensure the tokenizer
1670
+ # won't index sequences longer than the number of positional embeddings
1671
+ model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
1672
+ if model_max_length is not None and isinstance(
1673
+ model_max_length, (int, float)
1674
+ ):
1675
+ init_kwargs["model_max_length"] = min(
1676
+ init_kwargs.get("model_max_length", int(1e30)), model_max_length
1677
+ )
1678
+
1679
+ added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
1680
+ # Merge resolved_vocab_files arguments into init_kwargs if not already included.
1681
+ # Maybe need more ways to load resources.
1682
+ for args_name, file_path in resolved_vocab_files.items():
1683
+ # when `pretrained_model_name_or_path` is a pretrained model name,
1684
+ # use pretrained_init_configuration as `init_kwargs` to init which
1685
+ # does not include the vocab file in it, thus add vocab file into
1686
+ # args.
1687
+ if args_name not in init_kwargs:
1688
+ init_kwargs[args_name] = file_path
1689
+ # when `pretrained_model_name_or_path` is a pretrained model dir,
1690
+ # use tokenizer_config_file.json as `init_kwargs` to init which
1691
+ # does include a vocab file path in it. However, if the vocab file
1692
+ # path included in json does not exist, such as was deleted, to make
1693
+ # it still work, use the vocab file under this dir.
1694
+ elif not os.path.isfile(init_kwargs[args_name] or "") and os.path.isfile(
1695
+ file_path
1696
+ ):
1697
+ init_kwargs[args_name] = file_path
1698
+
1699
+ # TODO(zhoushunjie): Loading tokenizer.json from the HF Hub is not supported so far.
1700
+ if from_hf_hub and "tokenizer_file" in init_kwargs:
1701
+ init_kwargs.pop("tokenizer_file")
1702
+
1703
+ # TODO(guosheng): avoid reduplication of position args and key word args
1704
+ tokenizer = cls(*init_args, **init_kwargs)
1705
+ chat_template = init_kwargs.pop("chat_template", None)
1706
+ if chat_template is not None:
1707
+ tokenizer.init_chat_template(chat_template)
1708
+ special_tokens_map_file = resolved_vocab_files.pop(
1709
+ "special_tokens_map_file", None
1710
+ )
1711
+ if special_tokens_map_file is not None:
1712
+ with open(
1713
+ special_tokens_map_file, encoding="utf-8"
1714
+ ) as special_tokens_map_handle:
1715
+ special_tokens_map = json.load(special_tokens_map_handle)
1716
+ for key, value in special_tokens_map.items():
1717
+ if key in kwargs and kwargs[key]:
1718
+ # This value has already been redefined by the kwargs
1719
+ # We keep this new value and ignore the one stored in the special_tokens_map_file
1720
+
1721
+ continue
1722
+
1723
+ if isinstance(value, dict):
1724
+ value = AddedToken(**value)
1725
+ elif isinstance(value, list):
1726
+ value = [
1727
+ AddedToken(**token) if isinstance(token, dict) else token
1728
+ for token in value
1729
+ ]
1730
+ setattr(tokenizer, key, value)
1731
+ # Add supplementary tokens.
1732
+ special_tokens = tokenizer.all_special_tokens
1733
+ if added_tokens_file is not None:
1734
+ with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
1735
+ added_tok_encoder = json.load(added_tokens_handle)
1736
+
1737
+ # Sort added tokens by index
1738
+ added_tok_encoder_sorted = list(
1739
+ sorted(added_tok_encoder.items(), key=lambda x: x[1])
1740
+ )
1741
+ for token, index in added_tok_encoder_sorted:
1742
+ if (
1743
+ has_tokenizer_file
1744
+ and index != len(tokenizer)
1745
+ and tokenizer.convert_tokens_to_ids(token) != index
1746
+ ):
1747
+ # index is the current length of the tokenizer (not in vocabulary)
1748
+ raise ValueError(
1749
+ f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
1750
+ f"{index}."
1751
+ )
1752
+ elif not has_tokenizer_file and index != len(tokenizer):
1753
+ # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
1754
+ # current length of the tokenizer.
1755
+ raise ValueError(
1756
+ f"Non-consecutive added token '{token}' found. "
1757
+ f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
1758
+ )
1759
+
1760
+ tokenizer.add_tokens(
1761
+ token, special_tokens=bool(token in special_tokens)
1762
+ )
1763
+ # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
1764
+ added_tokens = tokenizer.sanitize_special_tokens()
1765
+ if added_tokens:
1766
+ logging.info(
1767
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
1768
+ )
1769
+ # save all of related things into default root dir
1770
+ if pretrained_model_name_or_path in cls.pretrained_init_configuration:
1771
+ # tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder))
1772
+ tokenizer.save_pretrained(cache_dir)
1773
+
1774
+ if return_tokenizer_file_dir:
1775
+ return tokenizer, list(tokenizer_config_file_dir_list)[0]
1776
+ return tokenizer
1777
+
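+ # Usage sketch (illustrative; the model names mirror the docstring examples above):
+ #
+ #     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ #     # When the directory that the tokenizer files were resolved from is also needed:
+ #     tokenizer, file_dir = BertTokenizer.from_pretrained(
+ #         "./my_bert/", return_tokenizer_file_dir=True
+ #     )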
1778
+ def save_pretrained(
1779
+ self, save_directory, filename_prefix: Optional[str] = None, **kwargs
1780
+ ):
1781
+ """
1782
+ Save tokenizer configuration and related resources to files under
1783
+ `save_directory`. The tokenizer configuration would be saved into
1784
+ the file indicated by `tokenizer_config_file` (i.e. `tokenizer_config.json`),
1785
+ and resources would be saved into the files indicated by `resource_files_names`
1786
+ by using `self.save_resources(save_directory)`.
1787
+
1788
+ The `save_directory` can be used in `from_pretrained` as argument value
1789
+ of `pretrained_model_name_or_path` to re-load the tokenizer.
1790
+
1791
+ Args:
1792
+ save_directory (str): Directory to save files into.
1793
+ filename_prefix (str, optional):
1794
+ A prefix to add to the names of the files saved by the tokenizer.
1795
+
1796
+ Example:
1797
+ .. code-block::
1798
+
1799
+ from paddlenlp.transformers import BertTokenizer
1800
+
1801
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1802
+ tokenizer.save_pretrained('trained_model')
1803
+ # reload from save_directory
1804
+ tokenizer = BertTokenizer.from_pretrained('trained_model')
1805
+ """
1806
+ assert not os.path.isfile(
1807
+ save_directory
1808
+ ), "Saving directory ({}) should be a directory, not a file".format(
1809
+ save_directory
1810
+ )
1811
+ os.makedirs(save_directory, exist_ok=True)
1812
+
1813
+ special_tokens_map_file = os.path.join(
1814
+ save_directory,
1815
+ (filename_prefix + "-" if filename_prefix else "")
1816
+ + SPECIAL_TOKENS_MAP_FILE,
1817
+ )
1818
+ tokenizer_config_file = os.path.join(
1819
+ save_directory,
1820
+ (filename_prefix + "-" if filename_prefix else "")
1821
+ + self.tokenizer_config_file,
1822
+ )
1823
+
1824
+ tokenizer_config = copy.deepcopy(self.init_kwargs)
1825
+ if len(self.init_inputs) > 0:
1826
+ tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
1827
+ for file_id in self.resource_files_names.keys():
1828
+ tokenizer_config.pop(file_id, None)
1829
+
1830
+ # Sanitize AddedTokens
1831
+ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
1832
+ if isinstance(obj, AddedToken):
1833
+ out = obj.__getstate__()
1834
+ if add_type_field:
1835
+ out["__type"] = "AddedToken"
1836
+ return out
1837
+ elif isinstance(obj, (list, tuple)):
1838
+ return list(
1839
+ convert_added_tokens(o, add_type_field=add_type_field) for o in obj
1840
+ )
1841
+ elif isinstance(obj, dict):
1842
+ return {
1843
+ k: convert_added_tokens(v, add_type_field=add_type_field)
1844
+ for k, v in obj.items()
1845
+ }
1846
+ return obj
1847
+
1848
+ # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
1849
+ tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)
1850
+
1851
+ # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
1852
+ tokenizer_class = self.__class__.__name__
1853
+ tokenizer_config["tokenizer_class"] = tokenizer_class
1854
+
1855
+ with io.open(tokenizer_config_file, "w", encoding="utf-8") as f:
1856
+ f.write(json.dumps(tokenizer_config, ensure_ascii=False))
1857
+ logging.info(f"tokenizer config file saved in {tokenizer_config_file}")
1858
+
1859
+ # Sanitize AddedTokens in special_tokens_map
1860
+ write_dict = convert_added_tokens(
1861
+ self.special_tokens_map_extended, add_type_field=False
1862
+ )
1863
+ with open(special_tokens_map_file, "w", encoding="utf-8") as f:
1864
+ f.write(json.dumps(write_dict, ensure_ascii=False))
1865
+ logging.info(f"Special tokens file saved in {special_tokens_map_file}")
1866
+
1867
+ file_names = (tokenizer_config_file, special_tokens_map_file)
1868
+
1869
+ save_files = self._save_pretrained(
1870
+ save_directory=save_directory,
1871
+ file_names=file_names,
1872
+ filename_prefix=filename_prefix,
1873
+ )
1874
+
1875
+ return save_files
1876
+
1877
+ def _save_pretrained(
1878
+ self,
1879
+ save_directory: Union[str, os.PathLike],
1880
+ file_names: Tuple[str],
1881
+ filename_prefix: Optional[str] = None,
1882
+ ) -> Tuple[str]:
1883
+ """
1884
+ Save a tokenizer using the tokenizer format: vocabulary + added tokens.
1885
+
1886
+ """
1887
+ save_directory = str(save_directory)
1888
+
1889
+ added_tokens_file = os.path.join(
1890
+ save_directory,
1891
+ (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE,
1892
+ )
1893
+ added_vocab = self.get_added_vocab()
1894
+ if added_vocab:
1895
+ with open(added_tokens_file, "w", encoding="utf-8") as f:
1896
+ out_str = json.dumps(added_vocab, ensure_ascii=False)
1897
+ f.write(out_str)
1898
+ logging.info(f"added tokens file saved in {added_tokens_file}")
1899
+
1900
+ self.save_resources(save_directory)
1901
+
1902
+ return file_names + (added_tokens_file,)
1903
+
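+ # Sketch of the files typically written by save_pretrained/_save_pretrained
+ # (assuming added tokens exist and the subclass writes a vocab file in save_resources):
+ #
+ #     trained_model/
+ #         tokenizer_config.json     # init kwargs plus "tokenizer_class"
+ #         special_tokens_map.json   # bos/eos/pad/... special tokens
+ #         added_tokens.json         # tokens added on top of the base vocabulary
+ #         vocab.txt                 # produced by self.save_resources(save_directory)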
1904
+ def tokenize(
1905
+ self,
1906
+ text: str,
1907
+ pair: Optional[str] = None,
1908
+ add_special_tokens: bool = False,
1909
+ **kwargs,
1910
+ ) -> List[str]:
1911
+ """
1912
+ Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.
1913
+
1914
+ Args:
1915
+ text (`str`):
1916
+ The sequence to be encoded.
1917
+ pair (`str`, *optional*):
1918
+ A second sequence to be encoded with the first.
1919
+ add_special_tokens (`bool`, *optional*, defaults to `False`):
1920
+ Whether or not to add the special tokens associated with the corresponding model.
1921
+ kwargs (additional keyword arguments, *optional*):
1922
+ Will be passed to the underlying model specific encode method. See details in
1923
+ [`~PretrainedTokenizerBase.__call__`]
1924
+
1925
+ Returns:
1926
+ `List[str]`: The list of tokens.
1927
+ """
1928
+ raise NotImplementedError
1929
+
1930
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
1931
+ raise NotImplementedError
1932
+
1933
+ def _get_padding_truncation_strategies(
1934
+ self,
1935
+ padding=False,
1936
+ truncation=False,
1937
+ max_length=None,
1938
+ pad_to_multiple_of=None,
1939
+ verbose=True,
1940
+ **kwargs,
1941
+ ):
1942
+ """
1943
+ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
1944
+ and pad_to_max_length) and behaviors.
1945
+ """
1946
+ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
1947
+ old_pad_to_max_length = kwargs.pop("pad_to_max_seq_len", False)
1948
+
1949
+ # Backward compatibility for previous behavior, maybe we should deprecate it:
1950
+ # If you only set max_length, it activates truncation for max_length
1951
+ if max_length is not None and padding is False and truncation is False:
1952
+ if verbose:
1953
+ if not self.deprecation_warnings.get(
1954
+ "Truncation-not-explicitly-activated", False
1955
+ ):
1956
+ warnings.warn(
1957
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, "
1958
+ "please use `truncation=True` to explicitly truncate examples to max length. "
1959
+ "Defaulting to 'longest_first' truncation strategy. "
1960
+ "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
1961
+ "more precisely by providing a specific strategy to `truncation`."
1962
+ )
1963
+ self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
1964
+ truncation = "longest_first"
1965
+
1966
+ # Get padding strategy
1967
+ if padding is False and old_pad_to_max_length:
1968
+ if verbose:
1969
+ warnings.warn(
1970
+ "The `pad_to_max_seq_len` argument is deprecated and will be removed in a future version, "
1971
+ "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
1972
+ "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
1973
+ "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
1974
+ "maximal input size of the model (e.g. 512 for Bert).",
1975
+ FutureWarning,
1976
+ )
1977
+ if max_length is None:
1978
+ padding_strategy = PaddingStrategy.LONGEST
1979
+ else:
1980
+ padding_strategy = PaddingStrategy.MAX_LENGTH
1981
+ elif padding is not False:
1982
+ if padding is True:
1983
+ if verbose:
1984
+ if max_length is not None and (
1985
+ truncation is False or truncation == "do_not_truncate"
1986
+ ):
1987
+ warnings.warn(
1988
+ "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
1989
+ "To pad to max length, use `padding='max_length'`."
1990
+ )
1991
+ if old_pad_to_max_length is not False:
1992
+ warnings.warn(
1993
+ "Though `pad_to_max_seq_len` = `True`, it is ignored because `padding`=`True`."
1994
+ )
1995
+ # Default to pad to the longest sequence in the batch
1996
+ padding_strategy = PaddingStrategy.LONGEST
1997
+ elif not isinstance(padding, PaddingStrategy):
1998
+ padding_strategy = PaddingStrategy(padding)
1999
+ elif isinstance(padding, PaddingStrategy):
2000
+ padding_strategy = padding
2001
+ else:
2002
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
2003
+
2004
+ # Get truncation strategy
2005
+ if truncation is False and old_truncation_strategy != "do_not_truncate":
2006
+ if verbose:
2007
+ warnings.warn(
2008
+ "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
2009
+ "use `truncation=True` to truncate examples to a max length. You can give a specific "
2010
+ "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
2011
+ "maximal input size of the model (e.g. 512 for Bert). "
2012
+ " If you have pairs of inputs, you can give a specific truncation strategy selected among "
2013
+ "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
2014
+ "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
2015
+ "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
2016
+ FutureWarning,
2017
+ )
2018
+ truncation_strategy = TruncationStrategy(old_truncation_strategy)
2019
+ elif truncation is not False and truncation is not None:
2020
+ if truncation is True:
2021
+ truncation_strategy = (
2022
+ TruncationStrategy.LONGEST_FIRST
2023
+ ) # Default to truncate the longest sequences in pairs of inputs
2024
+ elif not isinstance(truncation, TruncationStrategy):
2025
+ truncation_strategy = TruncationStrategy(truncation)
2026
+ elif isinstance(truncation, TruncationStrategy):
2027
+ truncation_strategy = truncation
2028
+ else:
2029
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
2030
+
2031
+ # Set max length if needed
2032
+ if max_length is None:
2033
+ if padding_strategy == PaddingStrategy.MAX_LENGTH:
2034
+ if self.model_max_length > LARGE_INTEGER:
2035
+ if verbose:
2036
+ if not self.deprecation_warnings.get(
2037
+ "Asking-to-pad-to-max_length", False
2038
+ ):
2039
+ warnings.warn(
2040
+ "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
2041
+ "Default to no padding."
2042
+ )
2043
+ self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
2044
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
2045
+ else:
2046
+ max_length = self.model_max_length
2047
+
2048
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
2049
+ if self.model_max_length > LARGE_INTEGER:
2050
+ if verbose:
2051
+ if not self.deprecation_warnings.get(
2052
+ "Asking-to-truncate-to-max_length", False
2053
+ ):
2054
+ warnings.warn(
2055
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
2056
+ "Default to no truncation."
2057
+ )
2058
+ self.deprecation_warnings[
2059
+ "Asking-to-truncate-to-max_length"
2060
+ ] = True
2061
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
2062
+ else:
2063
+ max_length = self.model_max_length
2064
+
2065
+ # Test if we have a padding token
2066
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD and (
2067
+ not self.pad_token or self.pad_token_id < 0
2068
+ ):
2069
+ raise ValueError(
2070
+ "Asking to pad but the tokenizer does not have a padding token. "
2071
+ "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
2072
+ "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
2073
+ )
2074
+
2075
+ # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
2076
+ if (
2077
+ truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
2078
+ and padding_strategy != PaddingStrategy.DO_NOT_PAD
2079
+ and pad_to_multiple_of is not None
2080
+ and max_length is not None
2081
+ and (max_length % pad_to_multiple_of != 0)
2082
+ ):
2083
+ raise ValueError(
2084
+ f"Truncation and padding are both activated but "
2085
+ f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
2086
+ )
2087
+
2088
+ return padding_strategy, truncation_strategy, max_length, kwargs
2089
+
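+ # A minimal mapping sketch of how the public arguments resolve to strategies,
+ # based on the logic above (not an exhaustive list of cases):
+ #
+ #     padding=True            -> PaddingStrategy.LONGEST
+ #     padding="max_length"    -> PaddingStrategy.MAX_LENGTH
+ #     padding=False           -> PaddingStrategy.DO_NOT_PAD
+ #     truncation=True         -> TruncationStrategy.LONGEST_FIRST
+ #     truncation="only_first" -> TruncationStrategy.ONLY_FIRST
+ #     truncation=False/None   -> TruncationStrategy.DO_NOT_TRUNCATE
+ #     max_length=None         -> falls back to self.model_max_length when a strategy needs it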
2090
+ def __call__(
2091
+ self,
2092
+ text: Union[str, List[str], List[List[str]]],
2093
+ text_pair: Optional[Union[str, List[str], List[List[str]]]] = None,
2094
+ max_length: Optional[int] = None,
2095
+ stride: int = 0,
2096
+ is_split_into_words: Union[bool, str] = False,
2097
+ padding: Union[bool, str, PaddingStrategy] = False,
2098
+ truncation: Union[bool, str, TruncationStrategy] = False,
2099
+ return_position_ids: bool = None,
2100
+ return_token_type_ids: Optional[bool] = None,
2101
+ return_attention_mask: Optional[bool] = None,
2102
+ return_length: bool = False,
2103
+ return_overflowing_tokens: bool = False,
2104
+ return_special_tokens_mask: bool = False,
2105
+ return_dict: bool = True,
2106
+ return_offsets_mapping: bool = False,
2107
+ add_special_tokens: bool = True,
2108
+ pad_to_multiple_of: Optional[int] = None,
2109
+ return_tensors: Optional[Union[str, TensorType]] = None,
2110
+ verbose: bool = True,
2111
+ **kwargs,
2112
+ ):
2113
+ """
2114
+ Performs tokenization and uses the tokenized tokens to prepare model
2115
+ inputs. It supports sequence or sequence pair as input, and batch input
2116
+ is allowed. `self.encode()` or `self.batch_encode()` would be called
2117
+ separately for single or batch input depending on input format and
2118
+ `is_split_into_words` argument.
2119
+
2120
+ Args:
2121
+ text (str, List[str] or List[List[str]]):
2122
+ The sequence or batch of sequences to be processed. One sequence
2123
+ is a string or a list of strings depending on whether it has been
2124
+ pretokenized. If each sequence is provided as a list of strings
2125
+ (pretokenized), you must set `is_split_into_words` as `True` to
2126
+ disambiguate with a batch of sequences.
2127
+ text_pair (str, List[str] or List[List[str]], optional):
2128
+ Same as `text` argument, while it represents for the latter
2129
+ sequence of the sequence pair.
2130
+ max_length (int, optional):
2131
+ If set to a number, will limit the total sequence returned so
2132
+ that it has a maximum length. If there are overflowing tokens,
2133
+ those overflowing tokens will be added to the returned dictionary
2134
+ when `return_overflowing_tokens` is `True`. Defaults to `None`.
2135
+ stride (int, optional):
2136
+ Only available for batch input of sequence pair and mainly for
2137
+ question answering usage. When for QA, `text` represents questions
2138
+ and `text_pair` represents contexts. If `stride` is set to a
2139
+ positive number, the context will be split into multiple spans
2140
+ where `stride` defines the number of (tokenized) tokens to skip
2141
+ from the start of one span to get the next span, thus will produce
2142
+ a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
2143
+ and 'offset_mapping' preserving the original example and position
2144
+ information will be added to the returned dictionary. Defaults to 0.
2145
+ is_split_into_words (Union[bool, str], optional):
2146
+ When the text is words or tokens, `is_split_into_words` should be `True` or `token`.
2147
+ `True`: means that the text consists of words which should be tokenized.
2148
+ `token`: means that the text consists of tokens which have already been tokenized, so it should not be tokenized again.
2149
+ padding (bool, str or [PaddingStrategy], optional):
2150
+ Activates and controls padding. Accepts the following values:
2151
+
2152
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
2153
+ sequence is provided).
2154
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
2155
+ acceptable input length for the model if that argument is not provided.
2156
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
2157
+ lengths).
2158
+ Defaults to `False`.
2159
+ truncation (bool, str or [TruncationStrategy], optional):
2160
+ Activates and controls truncation. Accepts the following values:
2161
+
2162
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
2163
+ to the maximum acceptable input length for the model if that argument is not provided. This will
2164
+ truncate token by token, removing a token from the longest sequence in the pair if a pair of
2165
+ sequences (or a batch of pairs) is provided.
2166
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
2167
+ maximum acceptable input length for the model if that argument is not provided. This will only
2168
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2169
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
2170
+ maximum acceptable input length for the model if that argument is not provided. This will only
2171
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
2172
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
2173
+ greater than the model maximum admissible input size).
2174
+ Defaults to `False`.
2175
+ return_position_ids (bool, optional):
2176
+ Whether to include tokens position ids in the returned dictionary.
2177
+ Defaults to `False`.
2178
+ return_token_type_ids (bool, optional):
2179
+ Whether to include token type ids in the returned dictionary.
2180
+ Defaults to `True`.
2181
+ return_attention_mask (bool, optional):
2182
+ Whether to include the attention mask in the returned dictionary.
2183
+ Defaults to `False`.
2184
+ return_length (bool, optional):
2185
+ Whether to include the length of each encoded inputs in the
2186
+ returned dictionary. Defaults to `False`.
2187
+ return_overflowing_tokens (bool, optional):
2188
+ Whether to include overflowing token information in the returned
2189
+ dictionary. Defaults to `False`.
2190
+ return_special_tokens_mask (bool, optional):
2191
+ Whether to include special tokens mask information in the returned
2192
+ dictionary. Defaults to `False`.
2193
+ return_dict (bool, optional):
2194
+ Decide the format for returned encoded batch inputs. Only works when
2195
+ input is a batch of data.
2196
+ ::
2197
+ - If True, encoded inputs would be a dictionary like:
2198
+ {'input_ids': [[1, 4444, 4385, 1545, 6712],[1, 4444, 4385]],
2199
+ 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0]]}
2200
+ - If False, encoded inputs would be a list like:
2201
+ [{'input_ids': [1, 4444, 4385, 1545, 6712],
2202
+ 'token_type_ids': [0, 0, 0, 0, 0]},
2203
+ {'input_ids': [1, 4444, 4385], 'token_type_ids': [0, 0, 0]}]
2204
+
2205
+ Defaults to `True`.
2206
+ return_offsets_mapping (bool, optional):
2207
+ Whether to include the list of pairs preserving the index of start
2208
+ and end char in original input for each token in the returned
2209
+ dictionary. Would be automatically set to `True` when `stride` > 0.
2210
+ Defaults to `False`.
2211
+ add_special_tokens (bool, optional):
2212
+ Whether to add the special tokens associated with the corresponding model
2213
+ to the encoded inputs. Defaults to `True`
2214
+ pad_to_multiple_of (int, optional):
2215
+ If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
2216
+ the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
2217
+ Defaults to `None`.
2218
+ return_tensors (str or [TensorType], optional):
2219
+ If set, will return tensors instead of list of python integers. Acceptable values are:
2220
+
2221
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
2222
+ - `'np'`: Return Numpy `np.ndarray` objects.
2223
+ Defaults to `None`.
2224
+ verbose (bool, optional):
2225
+ Whether or not to print more information and warnings. Defaults to True.
2226
+
2227
+ Returns:
2228
+ dict or list[dict] (for batch input):
2229
+ The dict has the following optional items:
2230
+
2231
+ - **input_ids** (list[int] or list[list[int]]): List of token ids to be fed to a model.
2232
+ - **position_ids** (list[int] or list[list[int]], optional): List of token position ids to be
2233
+ fed to a model. Included when `return_position_ids` is `True`
2234
+ - **token_type_ids** (list[int] or list[list[int]], optional): List of token type ids to be
2235
+ fed to a model. Included when `return_token_type_ids` is `True`.
2236
+ - **attention_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
2237
+ where 0 specifies paddings and should not be attended to by the
2238
+ model. Included when `return_attention_mask` is `True`.
2239
+ - **seq_len** (int or list[int], optional): The input_ids length. Included when `return_length`
2240
+ is `True`.
2241
+ - **overflowing_tokens** (list[int] or list[list[int]], optional): List of overflowing tokens.
2242
+ Included when `max_length` is specified and `return_overflowing_tokens`
2243
+ is True.
2244
+ - **num_truncated_tokens** (int or list[int], optional): The number of overflowing tokens.
2245
+ Included when `max_length` is specified and `return_overflowing_tokens`
2246
+ is True.
2247
+ - **special_tokens_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
2248
+ with 0 specifying special added tokens and 1 specifying sequence tokens.
2249
+ Included when `return_special_tokens_mask` is `True`.
2250
+ - **offset_mapping** (list[int], optional): list of pairs preserving the
2251
+ index of start and end char in original input for each token.
2252
+ For a special token, the index pair is `(0, 0)`. Included when
2253
+ `return_overflowing_tokens` is True or `stride` > 0.
2254
+ - **overflow_to_sample** (int or list[int], optional): Index of example from which this
2255
+ feature is generated. Included when `stride` works.
2256
+ """
2257
+
2258
+ # Input type checking for clearer error
2259
+ def _is_valid_text_input(t):
2260
+ if isinstance(t, str):
2261
+ # Strings are fine
2262
+ return True
2263
+ elif isinstance(t, (list, tuple)):
2264
+ # List are fine as long as they are...
2265
+ if len(t) == 0:
2266
+ # ... empty
2267
+ return True
2268
+ elif isinstance(t[0], str):
2269
+ # ... list of strings
2270
+ return True
2271
+ elif isinstance(t[0], (list, tuple)):
2272
+ # ... list with an empty list or with a list of strings
2273
+ return len(t[0]) == 0 or isinstance(t[0][0], str)
2274
+ else:
2275
+ return False
2276
+ else:
2277
+ return False
2278
+
2279
+ if not _is_valid_text_input(text):
2280
+ raise ValueError(
2281
+ "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2282
+ "or `List[List[str]]` (batch of pretokenized examples)."
2283
+ )
2284
+
2285
+ if text_pair is not None and not _is_valid_text_input(text_pair):
2286
+ raise ValueError(
2287
+ "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2288
+ "or `List[List[str]]` (batch of pretokenized examples)."
2289
+ )
2290
+
2291
+ # check `split_into_words` value
2292
+ if isinstance(is_split_into_words, str) and is_split_into_words != "token":
2293
+ raise ValueError(
2294
+ "the value of `is_split_into_words` should be one of: {True, False, 'token'} but received: <%s>",
2295
+ is_split_into_words,
2296
+ )
2297
+
2298
+ if is_split_into_words:
2299
+ is_batched = (
2300
+ isinstance(text, (list, tuple))
2301
+ and text
2302
+ and isinstance(text[0], (list, tuple))
2303
+ )
2304
+ else:
2305
+ is_batched = isinstance(text, (list, tuple))
2306
+
2307
+ if is_batched:
2308
+ if isinstance(text_pair, str):
2309
+ raise TypeError(
2310
+ "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
2311
+ )
2312
+ if text_pair is not None and len(text) != len(text_pair):
2313
+ raise ValueError(
2314
+ f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
2315
+ )
2316
+ batch_text_or_text_pairs = (
2317
+ list(zip(text, text_pair)) if text_pair is not None else text
2318
+ )
2319
+ return self.batch_encode(
2320
+ batch_text_or_text_pairs=batch_text_or_text_pairs,
2321
+ max_length=max_length,
2322
+ stride=stride,
2323
+ is_split_into_words=is_split_into_words,
2324
+ padding=padding,
2325
+ truncation=truncation,
2326
+ return_position_ids=return_position_ids,
2327
+ return_token_type_ids=return_token_type_ids,
2328
+ return_attention_mask=return_attention_mask,
2329
+ return_length=return_length,
2330
+ return_overflowing_tokens=return_overflowing_tokens,
2331
+ return_special_tokens_mask=return_special_tokens_mask,
2332
+ return_dict=return_dict,
2333
+ return_offsets_mapping=return_offsets_mapping,
2334
+ add_special_tokens=add_special_tokens,
2335
+ pad_to_multiple_of=pad_to_multiple_of,
2336
+ return_tensors=return_tensors,
2337
+ verbose=verbose,
2338
+ **kwargs,
2339
+ )
2340
+ else:
2341
+ return self.encode(
2342
+ text=text,
2343
+ text_pair=text_pair,
2344
+ max_length=max_length,
2345
+ stride=stride,
2346
+ is_split_into_words=is_split_into_words,
2347
+ padding=padding,
2348
+ truncation=truncation,
2349
+ return_position_ids=return_position_ids,
2350
+ return_token_type_ids=return_token_type_ids,
2351
+ return_attention_mask=return_attention_mask,
2352
+ return_length=return_length,
2353
+ return_overflowing_tokens=return_overflowing_tokens,
2354
+ return_special_tokens_mask=return_special_tokens_mask,
2355
+ return_offsets_mapping=return_offsets_mapping,
2356
+ add_special_tokens=add_special_tokens,
2357
+ pad_to_multiple_of=pad_to_multiple_of,
2358
+ return_tensors=return_tensors,
2359
+ verbose=verbose,
2360
+ **kwargs,
2361
+ )
2362
+
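+ # Usage sketch (assumes a concrete subclass, e.g. the BertTokenizer used in the
+ # docstring examples above):
+ #
+ #     # single example -> routed to self.encode()
+ #     out = tokenizer("He was a puppeteer", max_length=16, truncation=True)
+ #     # batch of question/context pairs -> routed to self.batch_encode()
+ #     out = tokenizer(["question 1", "question 2"],
+ #                     ["context 1", "context 2"],
+ #                     padding=True, return_attention_mask=True)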
2363
+ def encode(
2364
+ self,
2365
+ text,
2366
+ text_pair=None,
2367
+ add_special_tokens=True,
2368
+ padding: Union[bool, str, PaddingStrategy] = False,
2369
+ truncation: Union[bool, str, TruncationStrategy] = False,
2370
+ max_length: Optional[int] = None,
2371
+ stride: int = 0,
2372
+ is_split_into_words: bool = False,
2373
+ pad_to_multiple_of: Optional[int] = None,
2374
+ return_tensors: Optional[Union[str, TensorType]] = None,
2375
+ return_token_type_ids: Optional[bool] = None,
2376
+ return_attention_mask: Optional[bool] = None,
2377
+ return_overflowing_tokens: bool = False,
2378
+ return_special_tokens_mask: bool = False,
2379
+ return_offsets_mapping: bool = False,
2380
+ return_length: bool = False,
2381
+ verbose: bool = True,
2382
+ return_position_ids=None,
2383
+ **kwargs,
2384
+ ) -> BatchEncoding:
2385
+ """
2386
+ Tokenize and prepare for the model a sequence or a pair of sequences.
2387
+
2388
+ Args:
2389
+ text (`str`, `List[str]` or `List[int]`):
2390
+ The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
2391
+ `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2392
+ method).
2393
+ text_pair (`str`, `List[str]` or `List[int]`, *optional*):
2394
+ Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
2395
+ the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2396
+ method).
2397
+ """
2398
+ # Backward compatibility for 'max_seq_len'
2399
+ old_max_seq_len = kwargs.get("max_seq_len", None)
2400
+ if max_length is None and old_max_seq_len:
2401
+ if verbose:
2402
+ warnings.warn(
2403
+ "The `max_seq_len` argument is deprecated and will be removed in a future version, "
2404
+ "please use `max_length` instead.",
2405
+ FutureWarning,
2406
+ )
2407
+ max_length = old_max_seq_len
2408
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2409
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2410
+ self._get_padding_truncation_strategies(
2411
+ padding=padding,
2412
+ truncation=truncation,
2413
+ max_length=max_length,
2414
+ pad_to_multiple_of=pad_to_multiple_of,
2415
+ verbose=verbose,
2416
+ **kwargs,
2417
+ )
2418
+ )
2419
+
2420
+ return self._encode_plus(
2421
+ text=text,
2422
+ text_pair=text_pair,
2423
+ add_special_tokens=add_special_tokens,
2424
+ padding_strategy=padding_strategy,
2425
+ truncation_strategy=truncation_strategy,
2426
+ max_length=max_length,
2427
+ stride=stride,
2428
+ is_split_into_words=is_split_into_words,
2429
+ pad_to_multiple_of=pad_to_multiple_of,
2430
+ return_tensors=return_tensors,
2431
+ return_position_ids=return_position_ids,
2432
+ return_token_type_ids=return_token_type_ids,
2433
+ return_attention_mask=return_attention_mask,
2434
+ return_overflowing_tokens=return_overflowing_tokens,
2435
+ return_special_tokens_mask=return_special_tokens_mask,
2436
+ return_offsets_mapping=return_offsets_mapping,
2437
+ return_length=return_length,
2438
+ verbose=verbose,
2439
+ **kwargs,
2440
+ )
2441
+
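+ # Usage sketch for a single example (illustrative; for batches use __call__ or batch_encode):
+ #
+ #     enc = tokenizer.encode("He was a puppeteer", "He was a man",
+ #                            max_length=32, truncation="only_second")
+ #     # enc["input_ids"], enc["token_type_ids"], ...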
2442
+ def encode_plus(
2443
+ self,
2444
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
2445
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2446
+ add_special_tokens: bool = True,
2447
+ padding: Union[bool, str, PaddingStrategy] = False,
2448
+ truncation: Union[bool, str, TruncationStrategy] = None,
2449
+ max_length: Optional[int] = None,
2450
+ stride: int = 0,
2451
+ is_split_into_words: bool = False,
2452
+ pad_to_multiple_of: Optional[int] = None,
2453
+ return_tensors: Optional[Union[str, TensorType]] = None,
2454
+ return_token_type_ids: Optional[bool] = None,
2455
+ return_attention_mask: Optional[bool] = None,
2456
+ return_overflowing_tokens: bool = False,
2457
+ return_special_tokens_mask: bool = False,
2458
+ return_offsets_mapping: bool = False,
2459
+ return_length: bool = False,
2460
+ verbose: bool = True,
2461
+ **kwargs,
2462
+ ) -> BatchEncoding:
2463
+ """
2464
+ Tokenize and prepare for the model a sequence or a pair of sequences.
2465
+
2466
+ <Tip warning={true}>
2467
+
2468
+ This method is deprecated, `__call__` should be used instead.
2469
+
2470
+ </Tip>
2471
+
2472
+ Args:
2473
+ text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
2474
+ The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
2475
+ `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2476
+ method).
2477
+ text_pair (`str`, `List[str]` or `List[int]`, *optional*):
2478
+ Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
2479
+ the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
2480
+ method).
2481
+ """
2482
+
2483
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2484
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2485
+ self._get_padding_truncation_strategies(
2486
+ padding=padding,
2487
+ truncation=truncation,
2488
+ max_length=max_length,
2489
+ pad_to_multiple_of=pad_to_multiple_of,
2490
+ verbose=verbose,
2491
+ **kwargs,
2492
+ )
2493
+ )
2494
+
2495
+ return self._encode_plus(
2496
+ text=text,
2497
+ text_pair=text_pair,
2498
+ add_special_tokens=add_special_tokens,
2499
+ padding_strategy=padding_strategy,
2500
+ truncation_strategy=truncation_strategy,
2501
+ max_length=max_length,
2502
+ stride=stride,
2503
+ is_split_into_words=is_split_into_words,
2504
+ pad_to_multiple_of=pad_to_multiple_of,
2505
+ return_tensors=return_tensors,
2506
+ return_token_type_ids=return_token_type_ids,
2507
+ return_attention_mask=return_attention_mask,
2508
+ return_overflowing_tokens=return_overflowing_tokens,
2509
+ return_special_tokens_mask=return_special_tokens_mask,
2510
+ return_offsets_mapping=return_offsets_mapping,
2511
+ return_length=return_length,
2512
+ verbose=verbose,
2513
+ **kwargs,
2514
+ )
2515
+
2516
+ def _encode_plus(
2517
+ self,
2518
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
2519
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
2520
+ add_special_tokens: bool = True,
2521
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2522
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2523
+ max_length: Optional[int] = None,
2524
+ stride: int = 0,
2525
+ is_split_into_words: bool = False,
2526
+ pad_to_multiple_of: Optional[int] = None,
2527
+ return_position_ids: Optional[bool] = None,
2528
+ return_tensors: Optional[Union[str, TensorType]] = None,
2529
+ return_token_type_ids: Optional[bool] = None,
2530
+ return_attention_mask: Optional[bool] = None,
2531
+ return_overflowing_tokens: bool = False,
2532
+ return_special_tokens_mask: bool = False,
2533
+ return_offsets_mapping: bool = False,
2534
+ return_length: bool = False,
2535
+ verbose: bool = True,
2536
+ **kwargs,
2537
+ ) -> BatchEncoding:
2538
+ raise NotImplementedError
2539
+
2540
+ def batch_encode(
2541
+ self,
2542
+ batch_text_or_text_pairs: Union[
2543
+ List[TextInput],
2544
+ List[TextInputPair],
2545
+ List[PreTokenizedInput],
2546
+ List[PreTokenizedInputPair],
2547
+ List[EncodedInput],
2548
+ List[EncodedInputPair],
2549
+ ],
2550
+ max_length=None,
2551
+ stride: int = 0,
2552
+ is_split_into_words: bool = False,
2553
+ padding: Union[bool, str, PaddingStrategy] = False,
2554
+ truncation: Union[bool, str, TruncationStrategy] = False,
2555
+ return_position_ids=None,
2556
+ # TODO(wj-mcat): keep align with `encode` method
2557
+ return_token_type_ids=None,
2558
+ return_attention_mask=None,
2559
+ return_length=False,
2560
+ return_overflowing_tokens=False,
2561
+ return_special_tokens_mask=False,
2562
+ return_dict=True,
2563
+ return_offsets_mapping=False,
2564
+ add_special_tokens=True,
2565
+ pad_to_multiple_of: Optional[int] = None,
2566
+ return_tensors: Optional[Union[str, TensorType]] = None,
2567
+ verbose: bool = True,
2568
+ **kwargs,
2569
+ ) -> BatchEncoding:
2570
+ """
2571
+ Performs tokenization and uses the tokenized tokens to prepare model
2572
+ inputs. It supports batch inputs of sequence or sequence pair.
2573
+
2574
+ Args:
2575
+ batch_text_or_text_pairs (list):
2576
+ The element of list can be sequence or sequence pair, and the
2577
+ sequence is a string or a list of strings depending on whether
2578
+ it has been pretokenized. If each sequence is provided as a list
2579
+ of strings (pretokenized), you must set `is_split_into_words` as
2580
+ `True` to disambiguate with a sequence pair.
2581
+
2582
+ Returns:
2583
+ dict or list[dict]:
2584
+ The dict has the same optional items as described in `__call__`.
2585
+
2586
+ """
2587
+ # Backward compatibility for 'max_seq_len'
2588
+ old_max_seq_len = kwargs.get("max_seq_len", None)
2589
+ if max_length is None and old_max_seq_len:
2590
+ if verbose:
2591
+ warnings.warn(
2592
+ "The `max_seq_len` argument is deprecated and will be removed in a future version, "
2593
+ "please use `max_length` instead.",
2594
+ FutureWarning,
2595
+ )
2596
+ max_length = old_max_seq_len
2597
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2598
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2599
+ self._get_padding_truncation_strategies(
2600
+ padding=padding,
2601
+ truncation=truncation,
2602
+ max_length=max_length,
2603
+ pad_to_multiple_of=pad_to_multiple_of,
2604
+ verbose=verbose,
2605
+ **kwargs,
2606
+ )
2607
+ )
2608
+
2609
+ return self._batch_encode_plus(
2610
+ batch_text_or_text_pairs=batch_text_or_text_pairs,
2611
+ add_special_tokens=add_special_tokens,
2612
+ padding_strategy=padding_strategy,
2613
+ truncation_strategy=truncation_strategy,
2614
+ max_length=max_length,
2615
+ stride=stride,
2616
+ is_split_into_words=is_split_into_words,
2617
+ pad_to_multiple_of=pad_to_multiple_of,
2618
+ return_tensors=return_tensors,
2619
+ return_position_ids=return_position_ids,
2620
+ return_token_type_ids=return_token_type_ids,
2621
+ return_attention_mask=return_attention_mask,
2622
+ return_overflowing_tokens=return_overflowing_tokens,
2623
+ return_special_tokens_mask=return_special_tokens_mask,
2624
+ return_dict=return_dict,
2625
+ return_offsets_mapping=return_offsets_mapping,
2626
+ return_length=return_length,
2627
+ verbose=verbose,
2628
+ **kwargs,
2629
+ )
2630
+
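+ # Usage sketch (illustrative; mirrors the return_dict description in __call__):
+ #
+ #     batch = tokenizer.batch_encode(
+ #         [("question 1", "context 1"), ("question 2", "context 2")],
+ #         max_length=128, stride=32, padding="max_length", truncation=True,
+ #     )
+ #     # With return_dict=False the result would be a list with one dict per span/example.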
2631
+ def _batch_encode_plus(
2632
+ self,
2633
+ batch_text_or_text_pairs: Union[
2634
+ List[TextInput],
2635
+ List[TextInputPair],
2636
+ List[PreTokenizedInput],
2637
+ List[PreTokenizedInputPair],
2638
+ List[EncodedInput],
2639
+ List[EncodedInputPair],
2640
+ ],
2641
+ add_special_tokens: bool = True,
2642
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
2643
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
2644
+ max_length: Optional[int] = None,
2645
+ stride: int = 0,
2646
+ is_split_into_words: bool = False,
2647
+ pad_to_multiple_of: Optional[int] = None,
2648
+ return_position_ids: Optional[bool] = None,
2649
+ return_tensors: Optional[Union[str, TensorType]] = None,
2650
+ return_token_type_ids: Optional[bool] = None,
2651
+ return_attention_mask: Optional[bool] = None,
2652
+ return_overflowing_tokens: bool = False,
2653
+ return_special_tokens_mask: bool = False,
2654
+ return_dict: bool = True,
2655
+ return_offsets_mapping: bool = False,
2656
+ return_length: bool = False,
2657
+ verbose: bool = True,
2658
+ **kwargs,
2659
+ ) -> BatchEncoding:
2660
+ raise NotImplementedError
2661
+
2662
+ def pad(
2663
+ self,
2664
+ encoded_inputs: Union[
2665
+ BatchEncoding,
2666
+ List[BatchEncoding],
2667
+ Dict[str, EncodedInput],
2668
+ Dict[str, List[EncodedInput]],
2669
+ List[Dict[str, EncodedInput]],
2670
+ ],
2671
+ padding: Union[bool, str, PaddingStrategy] = True,
2672
+ max_length: Optional[int] = None,
2673
+ pad_to_multiple_of: Optional[int] = None,
2674
+ return_attention_mask: Optional[bool] = None,
2675
+ return_tensors: Optional[Union[str, TensorType]] = None,
2676
+ verbose: bool = True,
2677
+ ) -> BatchEncoding:
2678
+ """
2679
+ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
2680
+ in the batch.
2681
+
2682
+ Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
2683
+ `self.pad_token_id` and `self.pad_token_type_id`)
2684
+
2685
+ <Tip>
2686
+
2687
+ If the `encoded_inputs` passed are a dictionary of NumPy arrays or Paddle tensors, the
2688
+ result will use the same type unless you provide a different tensor type with `return_tensors`.
2689
+ </Tip>
2690
+
2691
+ Args:
2692
+ encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
2693
+ Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
2694
+ tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
2695
+ List[int]]]*) so you can use this method during preprocessing as well as in a Paddle Dataloader
2696
+ collate function.
2697
+
2698
+ Instead of `List[int]` you can have tensors (numpy arrays, Paddle tensors), see
2699
+ the note above for the return type.
2700
+ padding (`bool`, `str` or [`PaddingStrategy`], *optional*, defaults to `True`):
2701
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
2702
+ index) among:
2703
+
2704
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
2705
+ sequence is provided).
2706
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
2707
+ acceptable input length for the model if that argument is not provided.
2708
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
2709
+ lengths).
2710
+ max_length (`int`, *optional*):
2711
+ Maximum length of the returned list and optionally padding length (see above).
2712
+ pad_to_multiple_of (`int`, *optional*):
2713
+ If set, will pad the sequence to a multiple of the provided value.
2714
+
2715
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2716
+ >= 7.5 (Volta).
2717
+ return_attention_mask (`bool`, *optional*):
2718
+ Whether to return the attention mask. If left to the default, will return the attention mask according
2719
+ to the specific tokenizer's default, defined by the `return_outputs` attribute.
2720
+
2721
+ [What are attention masks?](../glossary#attention-mask)
2722
+ return_tensors (`str` or [`TensorType`], *optional*):
2723
+ If set, will return tensors instead of list of python integers. Acceptable values are:
2724
+
2725
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
2726
+ - `'np'`: Return Numpy `np.ndarray` objects.
2727
+ verbose (`bool`, *optional*, defaults to `True`):
2728
+ Whether or not to print more information and warnings.
2729
+ """
2730
+ # If we have a list of dicts, let's convert it in a dict of lists
2731
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(
2732
+ encoded_inputs[0], (dict, BatchEncoding)
2733
+ ):
2734
+ encoded_inputs = {
2735
+ key: [example[key] for example in encoded_inputs]
2736
+ for key in encoded_inputs[0].keys()
2737
+ }
2738
+
2739
+ # The model's main input name, usually `input_ids`, has to be passed for padding
2740
+ if self.model_input_names[0] not in encoded_inputs:
2741
+ raise ValueError(
2742
+ "You should supply an encoding or a list of encodings to this method "
2743
+ f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
2744
+ )
2745
+
2746
+ required_input = encoded_inputs[self.model_input_names[0]]
2747
+
2748
+ if not required_input:
2749
+ if return_attention_mask:
2750
+ encoded_inputs["attention_mask"] = []
2751
+ return encoded_inputs
2752
+
2753
+ # If we have Paddle/NumPy tensors/arrays as inputs, we cast them as python objects
2754
+ # and rebuild them afterwards if no return_tensors is specified
2755
+
2756
+ first_element = required_input[0]
2757
+ if isinstance(first_element, (list, tuple)):
2758
+ # first_element might be an empty list/tuple in some edge cases so we grab the first non-empty element.
2759
+ for item in required_input:
2760
+ if len(item) != 0:
2761
+ first_element = item[0]
2762
+ break
2763
+ # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
2764
+ if not isinstance(first_element, (int, list, tuple)):
2765
+ if isinstance(first_element, paddle.Tensor):
2766
+ return_tensors = "pd" if return_tensors is None else return_tensors
2767
+ else:
2768
+ raise ValueError(
2769
+ f"type of {first_element} unknown: {type(first_element)}. "
2770
+ f"Should be either python or paddle object."
2771
+ )
2772
+
2773
+ for key, value in encoded_inputs.items():
2774
+ encoded_inputs[key] = to_py_obj(value)
2775
+
2776
+ # Convert padding_strategy in PaddingStrategy
2777
+ padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
2778
+ padding=padding, max_length=max_length, verbose=verbose
2779
+ )
2780
+
2781
+ required_input = encoded_inputs[self.model_input_names[0]]
2782
+ if required_input and not isinstance(required_input[0], (list, tuple)):
2783
+ encoded_inputs = self._pad(
2784
+ encoded_inputs,
2785
+ max_length=max_length,
2786
+ padding_strategy=padding_strategy,
2787
+ pad_to_multiple_of=pad_to_multiple_of,
2788
+ return_attention_mask=return_attention_mask,
2789
+ )
2790
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
2791
+
2792
+ batch_size = len(required_input)
2793
+ assert all(
2794
+ len(v) == batch_size for v in encoded_inputs.values()
2795
+ ), "Some items in the output dictionary have a different batch size than others."
2796
+
2797
+ if padding_strategy == PaddingStrategy.LONGEST:
2798
+ max_length = max(len(inputs) for inputs in required_input)
2799
+ padding_strategy = PaddingStrategy.MAX_LENGTH
2800
+
2801
+ batch_outputs = {}
2802
+ for i in range(batch_size):
2803
+ inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
2804
+ outputs = self._pad(
2805
+ inputs,
2806
+ max_length=max_length,
2807
+ padding_strategy=padding_strategy,
2808
+ pad_to_multiple_of=pad_to_multiple_of,
2809
+ return_attention_mask=return_attention_mask,
2810
+ )
2811
+
2812
+ for key, value in outputs.items():
2813
+ if key not in batch_outputs:
2814
+ batch_outputs[key] = []
2815
+ batch_outputs[key].append(value)
2816
+
2817
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
2818
+
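For reference, the behaviour documented above boils down to: normalize a list of dicts into a dict of lists, find the target length, then pad each example on the tokenizer's padding side. A minimal standalone sketch of the `'longest'` strategy with right-side padding (the helper name and `pad_token_id=0` are illustrative assumptions, not part of this class):

# Minimal sketch of the "longest" padding behaviour described above; the
# helper name and pad_token_id=0 are assumptions made for illustration.
def pad_longest(batch, pad_token_id=0):
    # Turn a list of dicts into a dict of lists, as pad() does internally.
    features = {key: [example[key] for example in batch] for key in batch[0]}
    max_len = max(len(ids) for ids in features["input_ids"])
    padded, masks = [], []
    for ids in features["input_ids"]:
        difference = max_len - len(ids)
        padded.append(ids + [pad_token_id] * difference)   # pad on the right
        masks.append([1] * len(ids) + [0] * difference)    # 0 marks padding
    return {"input_ids": padded, "attention_mask": masks}

print(pad_longest([{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]))
# {'input_ids': [[1, 2, 3], [4, 5, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 0]]}

A concrete tokenizer's `pad` additionally handles tensor inputs, `pad_to_multiple_of`, and the other model input keys shown in `_pad` further below.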
2819
+ def create_token_type_ids_from_sequences(
2820
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2821
+ ) -> List[int]:
2822
+ """
2823
+ Create the token type IDs corresponding to the sequences passed. [What are token type
2824
+ IDs?](../glossary#token-type-ids)
2825
+
2826
+ Should be overridden in a subclass if the model has a special way of building those.
2827
+
2828
+ Args:
2829
+ token_ids_0 (`List[int]`): The first tokenized sequence.
2830
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
2831
+
2832
+ Returns:
2833
+ `List[int]`: The token type ids.
2834
+ """
2835
+ if token_ids_1 is None:
2836
+ return len(token_ids_0) * [0]
2837
+ return [0] * len(token_ids_0) + [1] * len(token_ids_1)
2838
+
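The default segment ids are all zeros for a single sequence and 0/1 for a pair; a quick standalone check mirroring the base implementation above:

# Mirrors the base implementation above: segment 0 for the first sequence,
# segment 1 for the (optional) second one.
def default_token_type_ids(token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [0] * len(token_ids_0)
    return [0] * len(token_ids_0) + [1] * len(token_ids_1)

assert default_token_type_ids([11, 12]) == [0, 0]
assert default_token_type_ids([11, 12], [21]) == [0, 0, 1]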
2839
+ def build_inputs_with_special_tokens(
2840
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
2841
+ ) -> List[int]:
2842
+ """
2843
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
2844
+ adding special tokens.
2845
+
2846
+ This implementation does not add special tokens and this method should be overridden in a subclass.
2847
+
2848
+ Args:
2849
+ token_ids_0 (`List[int]`): The first tokenized sequence.
2850
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
2851
+
2852
+ Returns:
2853
+ `List[int]`: The model input with special tokens.
2854
+ """
2855
+ if token_ids_1 is None:
2856
+ return token_ids_0
2857
+ return token_ids_0 + token_ids_1
2858
+
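Concrete tokenizers are expected to override this; a hedged sketch of a BERT-style override (the `[CLS]`/`[SEP]` ids 101 and 102 are illustrative assumptions, not values taken from this codebase):

# Hypothetical BERT-style override: [CLS] ids_0 [SEP] (ids_1 [SEP]).
CLS_ID, SEP_ID = 101, 102  # assumed ids, purely for illustration

def bert_style_inputs(token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [CLS_ID] + token_ids_0 + [SEP_ID]
    return [CLS_ID] + token_ids_0 + [SEP_ID] + token_ids_1 + [SEP_ID]

assert bert_style_inputs([7, 8]) == [101, 7, 8, 102]
assert bert_style_inputs([7, 8], [9]) == [101, 7, 8, 102, 9, 102]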
2859
+ def build_offset_mapping_with_special_tokens(
2860
+ self, offset_mapping_0, offset_mapping_1=None
2861
+ ):
2862
+ """
2863
+ Build an offset mapping from a pair of offset mappings by concatenating and adding the offsets of special tokens.
2864
+
2865
+ Should be overridden in a subclass if the model has a special way of building those.
2866
+
2867
+ Args:
2868
+ offset_mapping_0 (List[tuple]):
2869
+ List of char offsets to which the special tokens will be added.
2870
+ offset_mapping_1 (List[tuple], optional):
2871
+ Optional second list of char offsets for offset mapping pairs.
2872
+
2873
+ Returns:
2874
+ List[tuple]: List of char offsets with the appropriate offsets of special tokens.
2875
+ """
2876
+ if offset_mapping_1 is None:
2877
+ return offset_mapping_0
2878
+
2879
+ return offset_mapping_0 + offset_mapping_1
2880
+
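A matching override for offset mappings would give each inserted special token the dummy character span `(0, 0)`; a sketch consistent with the hypothetical BERT-style example above:

# Hypothetical override matching the BERT-style sketch above: every inserted
# special token contributes the dummy character span (0, 0).
def bert_style_offsets(offset_mapping_0, offset_mapping_1=None):
    if offset_mapping_1 is None:
        return [(0, 0)] + offset_mapping_0 + [(0, 0)]
    return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)]

assert bert_style_offsets([(0, 2), (3, 5)]) == [(0, 0), (0, 2), (3, 5), (0, 0)]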
2881
+ def prepare_for_model(
2882
+ self,
2883
+ ids,
2884
+ pair_ids=None,
2885
+ padding: Union[bool, str, PaddingStrategy] = False,
2886
+ truncation: Union[bool, str, TruncationStrategy] = False,
2887
+ max_length: Optional[int] = None,
2888
+ stride: int = 0,
2889
+ pad_to_multiple_of: Optional[int] = None,
2890
+ return_tensors: Optional[Union[str, TensorType]] = None,
2891
+ return_position_ids=None,
2892
+ return_token_type_ids: Optional[bool] = None,
2893
+ return_attention_mask: Optional[bool] = None,
2894
+ return_length=False,
2895
+ return_overflowing_tokens=False,
2896
+ return_special_tokens_mask=False,
2897
+ return_offsets_mapping=False,
2898
+ add_special_tokens=True,
2899
+ verbose: bool = True,
2900
+ prepend_batch_axis: bool = False,
2901
+ **kwargs,
2902
+ ):
2903
+ """
2904
+ Performs tokenization and uses the resulting tokens to prepare model
2905
+ inputs. It supports a single sequence or a sequence pair as input; batch
2906
+ input is not allowed.
2907
+ """
2908
+ padding_strategy, truncation_strategy, max_length, kwargs = (
2909
+ self._get_padding_truncation_strategies(
2910
+ padding=padding,
2911
+ truncation=truncation,
2912
+ max_length=max_length,
2913
+ pad_to_multiple_of=pad_to_multiple_of,
2914
+ verbose=verbose,
2915
+ **kwargs,
2916
+ )
2917
+ )
2918
+
2919
+ pair = bool(pair_ids is not None)
2920
+ len_ids = len(ids)
2921
+ len_pair_ids = len(pair_ids) if pair else 0
2922
+
2923
+ if return_token_type_ids and not add_special_tokens:
2924
+ raise ValueError(
2925
+ "Asking to return token_type_ids while setting add_special_tokens to False "
2926
+ "results in an undefined behavior. Please set add_special_tokens to True or "
2927
+ "set return_token_type_ids to None."
2928
+ )
2929
+
2930
+ if (
2931
+ return_overflowing_tokens
2932
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
2933
+ and pair_ids is not None
2934
+ ):
2935
+ raise ValueError(
2936
+ "Not possible to return overflowing tokens for pair of sequences with the "
2937
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
2938
+ "for instance `only_second` or `only_first`."
2939
+ )
2940
+
2941
+ # Load from model defaults
2942
+ if return_token_type_ids is None:
2943
+ return_token_type_ids = "token_type_ids" in self.model_input_names
2944
+ if return_attention_mask is None:
2945
+ return_attention_mask = "attention_mask" in self.model_input_names
2946
+ if return_position_ids is None:
2947
+ return_position_ids = "position_ids" in self.model_input_names
2948
+ encoded_inputs = {}
2949
+ # Truncation: Handle max sequence length
2950
+ total_len = (
2951
+ len_ids
2952
+ + len_pair_ids
2953
+ + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
2954
+ )
2955
+
2956
+ overflowing_tokens = []
2957
+
2958
+ if (
2959
+ truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
2960
+ and max_length
2961
+ and total_len > max_length
2962
+ ):
2963
+ ids, pair_ids, overflowing_tokens = self.truncate_sequences(
2964
+ ids,
2965
+ pair_ids=pair_ids,
2966
+ num_tokens_to_remove=total_len - max_length,
2967
+ truncation_strategy=truncation_strategy,
2968
+ stride=stride,
2969
+ )
2970
+ if return_overflowing_tokens:
2971
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
2972
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length
2973
+
2974
+ # Add special tokens
2975
+ if add_special_tokens:
2976
+ sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
2977
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
2978
+ else:
2979
+ sequence = ids + pair_ids if pair else ids
2980
+ token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
2981
+
2982
+ # Build output dictionary
2983
+ encoded_inputs["input_ids"] = sequence
2984
+ if return_token_type_ids:
2985
+ encoded_inputs["token_type_ids"] = token_type_ids
2986
+ if return_special_tokens_mask:
2987
+ if add_special_tokens:
2988
+ encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(
2989
+ ids, pair_ids
2990
+ )
2991
+ else:
2992
+ encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
2993
+
2994
+ if return_offsets_mapping and "text" in kwargs and "text_pair" in kwargs:
2995
+ text = kwargs.pop("text")
2996
+ text_pair = kwargs.pop("text_pair")
2997
+
2998
+ token_offset_mapping = self.get_offset_mapping(text)
2999
+ token_pair_offset_mapping = (
3000
+ self.get_offset_mapping(text_pair) if text_pair is not None else None
3001
+ )
3002
+ if max_length and total_len > max_length:
3003
+ token_offset_mapping, token_pair_offset_mapping, _ = (
3004
+ self.truncate_sequences(
3005
+ token_offset_mapping,
3006
+ pair_ids=token_pair_offset_mapping,
3007
+ num_tokens_to_remove=total_len - max_length,
3008
+ truncation_strategy=truncation_strategy,
3009
+ stride=stride,
3010
+ )
3011
+ )
3012
+ if add_special_tokens:
3013
+ offset_mapping = self.build_offset_mapping_with_special_tokens(
3014
+ token_offset_mapping, token_pair_offset_mapping
3015
+ )
3016
+ else:
3017
+ offset_mapping = (
3018
+ token_offset_mapping + token_pair_offset_mapping
3019
+ if token_pair_offset_mapping
3020
+ else token_offset_mapping
3021
+ )
3022
+ encoded_inputs["offset_mapping"] = offset_mapping
3023
+
3024
+ # Check lengths
3025
+ self._eventual_warn_about_too_long_sequence(
3026
+ encoded_inputs["input_ids"], max_length, verbose
3027
+ )
3028
+
3029
+ if return_position_ids:
3030
+ encoded_inputs["position_ids"] = list(
3031
+ range(len(encoded_inputs["input_ids"]))
3032
+ )
3033
+
3034
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
3035
+ encoded_inputs = self.pad(
3036
+ encoded_inputs,
3037
+ max_length=max_length,
3038
+ padding=padding_strategy.value,
3039
+ pad_to_multiple_of=pad_to_multiple_of,
3040
+ return_attention_mask=return_attention_mask,
3041
+ )
3042
+
3043
+ if return_length:
3044
+ encoded_inputs["length"] = len(encoded_inputs["input_ids"])
3045
+ # for compatibility
3046
+ encoded_inputs["seq_len"] = encoded_inputs["length"]
3047
+
3048
+ batch_outputs = BatchEncoding(
3049
+ encoded_inputs,
3050
+ tensor_type=return_tensors,
3051
+ prepend_batch_axis=prepend_batch_axis,
3052
+ )
3053
+
3054
+ return batch_outputs
3055
+
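Putting the pieces together, `prepare_for_model` truncates, adds special tokens, builds segment ids, and finally pads. A compressed standalone sketch of that pipeline (the special-token and pad ids are illustrative assumptions; a real subclass delegates to the overridable methods above):

# Compressed, standalone sketch of the truncate -> add special tokens -> pad
# pipeline; CLS_ID, SEP_ID and PAD_ID are assumptions for illustration only.
CLS_ID, SEP_ID, PAD_ID = 101, 102, 0

def prepare_for_model_sketch(ids, max_length):
    ids = ids[: max_length - 2]                 # truncate, leaving room for specials
    input_ids = [CLS_ID] + ids + [SEP_ID]       # cf. build_inputs_with_special_tokens
    attention_mask = [1] * len(input_ids)
    pad_len = max_length - len(input_ids)       # right-side padding
    return {
        "input_ids": input_ids + [PAD_ID] * pad_len,
        "token_type_ids": [0] * max_length,
        "attention_mask": attention_mask + [0] * pad_len,
    }

print(prepare_for_model_sketch([5, 6, 7], max_length=8))
# {'input_ids': [101, 5, 6, 7, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
#  'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0]}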
3056
+ def truncate_sequences(
3057
+ self,
3058
+ ids: List[int],
3059
+ pair_ids: Optional[List[int]] = None,
3060
+ num_tokens_to_remove: int = 0,
3061
+ truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
3062
+ stride: int = 0,
3063
+ ) -> Tuple[List[int], List[int], List[int]]:
3064
+ """
3065
+ Truncates a sequence pair in-place following the strategy.
3066
+
3067
+ Args:
3068
+ ids (`List[int]`):
3069
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
3070
+ `convert_tokens_to_ids` methods.
3071
+ pair_ids (`List[int]`, *optional*):
3072
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
3073
+ and `convert_tokens_to_ids` methods.
3074
+ num_tokens_to_remove (`int`, *optional*, defaults to 0):
3075
+ Number of tokens to remove using the truncation strategy.
3076
+ truncation_strategy (`str` or [`TruncationStrategy`], *optional*, defaults to `'longest_first'`):
3077
+ The strategy to follow for truncation. Can be:
3078
+
3079
+ - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
3080
+ maximum acceptable input length for the model if that argument is not provided. This will truncate
3081
+ token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
3082
+ batch of pairs) is provided.
3083
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
3084
+ maximum acceptable input length for the model if that argument is not provided. This will only
3085
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
3086
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
3087
+ maximum acceptable input length for the model if that argument is not provided. This will only
3088
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
3089
+ - `'do_not_truncate'`: No truncation (i.e., can output a batch with sequence lengths greater
3090
+ than the model maximum admissible input size).
3091
+ stride (`int`, *optional*, defaults to 0):
3092
+ If set to a positive number, the overflowing tokens returned will contain some tokens from the main
3093
+ sequence returned. The value of this argument defines the number of additional tokens.
3094
+
3095
+ Returns:
3096
+ `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
3097
+ overflowing tokens. Note: The *longest_first* strategy returns an empty list of overflowing tokens if a pair
3098
+ of sequences (or a batch of pairs) is provided.
3099
+ """
3100
+ if num_tokens_to_remove <= 0:
3101
+ return ids, pair_ids, []
3102
+
3103
+ if not isinstance(truncation_strategy, TruncationStrategy):
3104
+ truncation_strategy = TruncationStrategy(truncation_strategy)
3105
+
3106
+ overflowing_tokens = []
3107
+ if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
3108
+ truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
3109
+ ):
3110
+ if len(ids) > num_tokens_to_remove:
3111
+ window_len = min(len(ids), stride + num_tokens_to_remove)
3112
+ if self.truncation_side == "left":
3113
+ overflowing_tokens = ids[:window_len]
3114
+ ids = ids[num_tokens_to_remove:]
3115
+ elif self.truncation_side == "right":
3116
+ overflowing_tokens = ids[-window_len:]
3117
+ ids = ids[:-num_tokens_to_remove]
3118
+ else:
3119
+ raise ValueError(
3120
+ f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'."
3121
+ )
3122
+
3123
+ else:
3124
+ error_msg = (
3125
+ f"We need to remove {num_tokens_to_remove} to truncate the input "
3126
+ f"but the first sequence has a length {len(ids)}. "
3127
+ )
3128
+ if truncation_strategy == TruncationStrategy.ONLY_FIRST:
3129
+ error_msg = (
3130
+ error_msg + "Please select another truncation strategy than "
3131
+ f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
3132
+ )
3133
+ logging.error(error_msg)
3134
+ elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
3135
+ warnings.warn(
3136
+ f"Be aware, overflowing tokens are not returned for the setting you have chosen,"
3137
+ f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
3138
+ f"truncation strategy. So the returned list will always be empty even if some "
3139
+ f"tokens have been removed."
3140
+ )
3141
+ for _ in range(num_tokens_to_remove):
3142
+ if pair_ids is None or len(ids) > len(pair_ids):
3143
+ if self.truncation_side == "right":
3144
+ ids = ids[:-1]
3145
+ elif self.truncation_side == "left":
3146
+ ids = ids[1:]
3147
+ else:
3148
+ raise ValueError(
3149
+ "invalid truncation strategy:" + str(self.truncation_side)
3150
+ )
3151
+ else:
3152
+ if self.truncation_side == "right":
3153
+ pair_ids = pair_ids[:-1]
3154
+ elif self.truncation_side == "left":
3155
+ pair_ids = pair_ids[1:]
3156
+ else:
3157
+ raise ValueError(
3158
+ "invalid truncation strategy:" + str(self.truncation_side)
3159
+ )
3160
+ elif (
3161
+ truncation_strategy == TruncationStrategy.ONLY_SECOND
3162
+ and pair_ids is not None
3163
+ ):
3164
+ if len(pair_ids) > num_tokens_to_remove:
3165
+ window_len = min(len(pair_ids), stride + num_tokens_to_remove)
3166
+ if self.truncation_side == "right":
3167
+ overflowing_tokens = pair_ids[-window_len:]
3168
+ pair_ids = pair_ids[:-num_tokens_to_remove]
3169
+ elif self.truncation_side == "left":
3170
+ overflowing_tokens = pair_ids[:window_len]
3171
+ pair_ids = pair_ids[num_tokens_to_remove:]
3172
+ else:
3173
+ raise ValueError(
3174
+ "invalid truncation strategy:" + str(self.truncation_side)
3175
+ )
3176
+ else:
3177
+ logging.error(
3178
+ f"We need to remove {num_tokens_to_remove} to truncate the input "
3179
+ f"but the second sequence has a length {len(pair_ids)}. "
3180
+ f"Please select another truncation strategy than {truncation_strategy}, "
3181
+ f"for instance 'longest_first' or 'only_first'."
3182
+ )
3183
+
3184
+ return (ids, pair_ids, overflowing_tokens)
3185
+
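The stride/window arithmetic is easiest to see in isolation. A standalone sketch of the `'only_second'` branch with `truncation_side == 'right'`, mirroring the code above:

# Mirrors the ONLY_SECOND branch above with truncation_side == "right":
# the overflow keeps a stride-sized window of context from the kept part.
def truncate_only_second(ids, pair_ids, num_tokens_to_remove, stride=0):
    window_len = min(len(pair_ids), stride + num_tokens_to_remove)
    overflowing_tokens = pair_ids[-window_len:]
    pair_ids = pair_ids[:-num_tokens_to_remove]
    return ids, pair_ids, overflowing_tokens

ids, pair_ids, overflow = truncate_only_second([1, 2], [10, 11, 12, 13], 2, stride=1)
assert pair_ids == [10, 11] and overflow == [11, 12, 13]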
3186
+ def _pad(
3187
+ self,
3188
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
3189
+ max_length: Optional[int] = None,
3190
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
3191
+ pad_to_multiple_of: Optional[int] = None,
3192
+ return_attention_mask: Optional[bool] = None,
3193
+ ) -> dict:
3194
+ """
3195
+ Pad encoded inputs (on left/right and up to a predefined length or the max length in the batch)
3196
+
3197
+ Args:
3198
+ encoded_inputs:
3199
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
3200
+ max_length: maximum length of the returned list and optionally padding length (see below).
3201
+ Will truncate by taking into account the special tokens.
3202
+ padding_strategy: PaddingStrategy to use for padding.
3203
+
3204
+ - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
3205
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
3206
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
3207
+ The tokenizer padding sides are defined in self.padding_side:
3208
+
3209
+ - 'left': pads on the left of the sequences
3210
+ - 'right': pads on the right of the sequences
3211
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
3212
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
3213
+ >= 7.5 (Volta).
3214
+ return_attention_mask:
3215
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
3216
+ """
3217
+ # Load from model defaults
3218
+ if return_attention_mask is None:
3219
+ return_attention_mask = (
3220
+ "attention_mask" in self.model_input_names
3221
+ or "attention_mask" in encoded_inputs
3222
+ )
3223
+
3224
+ required_input = encoded_inputs[self.model_input_names[0]]
3225
+
3226
+ if padding_strategy == PaddingStrategy.LONGEST:
3227
+ max_length = len(required_input)
3228
+
3229
+ if (
3230
+ max_length is not None
3231
+ and pad_to_multiple_of is not None
3232
+ and (max_length % pad_to_multiple_of != 0)
3233
+ ):
3234
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
3235
+
3236
+ needs_to_be_padded = (
3237
+ padding_strategy != PaddingStrategy.DO_NOT_PAD
3238
+ and len(required_input) != max_length
3239
+ )
3240
+
3241
+ # Initialize attention mask if not present.
3242
+ if return_attention_mask and "attention_mask" not in encoded_inputs:
3243
+ encoded_inputs["attention_mask"] = [1] * len(required_input)
3244
+
3245
+ if needs_to_be_padded:
3246
+ difference = max_length - len(required_input)
3247
+
3248
+ if self.padding_side == "right":
3249
+ if return_attention_mask:
3250
+
3251
+ encoded_inputs["attention_mask"] = (
3252
+ encoded_inputs["attention_mask"] + [0] * difference
3253
+ )
3254
+ if "token_type_ids" in encoded_inputs:
3255
+ encoded_inputs["token_type_ids"] = (
3256
+ encoded_inputs["token_type_ids"]
3257
+ + [self.pad_token_type_id] * difference
3258
+ )
3259
+ if "special_tokens_mask" in encoded_inputs:
3260
+ encoded_inputs["special_tokens_mask"] = (
3261
+ encoded_inputs["special_tokens_mask"] + [1] * difference
3262
+ )
3263
+ if "offset_mapping" in encoded_inputs:
3264
+ encoded_inputs["offset_mapping"] = (
3265
+ encoded_inputs["offset_mapping"] + [(0, 0)] * difference
3266
+ )
3267
+ if "position_ids" in encoded_inputs:
3268
+ encoded_inputs["position_ids"] = (
3269
+ encoded_inputs["position_ids"] + [0] * difference
3270
+ )
3271
+ # NOTE: In ernie3.0-qa, the type of `*_positions` is int.
3272
+ if "start_positions" in encoded_inputs and isinstance(
3273
+ encoded_inputs["start_positions"], list
3274
+ ):
3275
+ encoded_inputs["start_positions"] = (
3276
+ encoded_inputs["start_positions"] + [0] * difference
3277
+ )
3278
+ if "end_positions" in encoded_inputs and isinstance(
3279
+ encoded_inputs["end_positions"], list
3280
+ ):
3281
+ encoded_inputs["end_positions"] = (
3282
+ encoded_inputs["end_positions"] + [0] * difference
3283
+ )
3284
+ encoded_inputs[self.model_input_names[0]] = (
3285
+ required_input + [self.pad_token_id] * difference
3286
+ )
3287
+ elif self.padding_side == "left":
3288
+ if return_attention_mask:
3289
+ encoded_inputs["attention_mask"] = [
3290
+ 0
3291
+ ] * difference + encoded_inputs["attention_mask"]
3292
+ if "token_type_ids" in encoded_inputs:
3293
+ encoded_inputs["token_type_ids"] = [
3294
+ self.pad_token_type_id
3295
+ ] * difference + encoded_inputs["token_type_ids"]
3296
+ if "special_tokens_mask" in encoded_inputs:
3297
+ encoded_inputs["special_tokens_mask"] = [
3298
+ 1
3299
+ ] * difference + encoded_inputs["special_tokens_mask"]
3300
+ if "offset_mapping" in encoded_inputs:
3301
+ encoded_inputs["offset_mapping"] = [
3302
+ (0, 0)
3303
+ ] * difference + encoded_inputs["offset_mapping"]
3304
+ if "position_ids" in encoded_inputs:
3305
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs[
3306
+ "position_ids"
3307
+ ]
3308
+ if "start_positions" in encoded_inputs and isinstance(
3309
+ encoded_inputs["start_positions"], list
3310
+ ):
3311
+ encoded_inputs["start_positions"] = [
3312
+ 0
3313
+ ] * difference + encoded_inputs["start_positions"]
3314
+ if "end_positions" in encoded_inputs and isinstance(
3315
+ encoded_inputs["end_positions"], list
3316
+ ):
3317
+ encoded_inputs["end_positions"] = [0] * difference + encoded_inputs[
3318
+ "end_positions"
3319
+ ]
3320
+ encoded_inputs[self.model_input_names[0]] = [
3321
+ self.pad_token_id
3322
+ ] * difference + required_input
3323
+ else:
3324
+ raise ValueError("Invalid padding strategy:" + str(self.padding_side))
3325
+
3326
+ return encoded_inputs
3327
+
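The `pad_to_multiple_of` handling in `_pad` simply rounds the target length up to the next multiple before any padding tokens are appended, which is what makes the resulting shapes Tensor Core friendly. A standalone sketch of that rounding:

# The pad_to_multiple_of rounding used by _pad above: the target length is
# rounded up to the next multiple before padding tokens are appended.
def round_up_to_multiple(max_length, pad_to_multiple_of):
    if pad_to_multiple_of and max_length % pad_to_multiple_of != 0:
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    return max_length

assert round_up_to_multiple(37, 8) == 40
assert round_up_to_multiple(40, 8) == 40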
3328
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
3329
+ """
3330
+ Converts a sequence of tokens into a single string. The simplest way to do it is `" ".join(tokens)`, but we
3331
+ often want to remove sub-word tokenization artifacts at the same time.
3332
+
3333
+ Args:
3334
+ tokens (`List[str]`): The tokens to join into a string.
3335
+
3336
+ Returns:
3337
+ `str`: The joined tokens.
3338
+ """
3339
+ raise NotImplementedError
3340
+
3341
+ def batch_decode(
3342
+ self,
3343
+ sequences: Union[List[int], List[List[int]], "np.ndarray", "paddle.Tensor"],
3344
+ skip_special_tokens: bool = False,
3345
+ clean_up_tokenization_spaces: bool = True,
3346
+ **kwargs,
3347
+ ) -> List[str]:
3348
+ """
3349
+ Convert a list of lists of token ids into a list of strings by calling decode.
3350
+
3351
+ Args:
3352
+ sequences (`Union[List[int], List[List[int]], np.ndarray, paddle.Tensor]`):
3353
+ List of tokenized input ids. Can be obtained using the `__call__` method.
3354
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
3355
+ Whether or not to remove special tokens in the decoding.
3356
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
3357
+ Whether or not to clean up the tokenization spaces.
3358
+ kwargs (additional keyword arguments, *optional*):
3359
+ Will be passed to the underlying model specific decode method.
3360
+
3361
+ Returns:
3362
+ `List[str]`: The list of decoded sentences.
3363
+ """
3364
+ return [
3365
+ self.decode(
3366
+ seq,
3367
+ skip_special_tokens=skip_special_tokens,
3368
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3369
+ **kwargs,
3370
+ )
3371
+ for seq in sequences
3372
+ ]
3373
+
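`batch_decode` is just `decode` mapped over each sequence. A self-contained toy illustration (the three-entry vocabulary and helper names are assumptions made purely for this example):

# Toy illustration of decode/batch_decode; the vocabulary and helper names
# are assumptions made purely for this example.
vocab = {0: "[PAD]", 1: "hello", 2: "world"}

def toy_decode(ids, skip_special_tokens=False):
    tokens = [vocab[i] for i in ids]
    if skip_special_tokens:
        tokens = [t for t in tokens if t != "[PAD]"]
    return " ".join(tokens)

def toy_batch_decode(sequences, **kwargs):
    # Exactly the shape of batch_decode above: decode mapped over the batch.
    return [toy_decode(seq, **kwargs) for seq in sequences]

assert toy_batch_decode([[1, 2, 0], [2, 1]], skip_special_tokens=True) == [
    "hello world",
    "world hello",
]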
3374
+ def decode(
3375
+ self,
3376
+ token_ids: Union[int, List[int], "np.ndarray", "paddle.Tensor"],
3377
+ skip_special_tokens: bool = False,
3378
+ clean_up_tokenization_spaces: bool = True,
3379
+ **kwargs,
3380
+ ) -> str:
3381
+ """
3382
+ Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
3383
+ tokens and clean up tokenization spaces.
3384
+
3385
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
3386
+
3387
+ Args:
3388
+ token_ids (`Union[int, List[int], np.ndarray, paddle.Tensor]`):
3389
+ List of tokenized input ids. Can be obtained using the `__call__` method.
3390
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
3391
+ Whether or not to remove special tokens in the decoding.
3392
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
3393
+ Whether or not to clean up the tokenization spaces.
3394
+ kwargs (additional keyword arguments, *optional*):
3395
+ Will be passed to the underlying model specific decode method.
3396
+
3397
+ Returns:
3398
+ `str`: The decoded sentence.
3399
+ """
3400
+ # Convert inputs to python lists
3401
+ token_ids = to_py_obj(token_ids)
3402
+
3403
+ return self._decode(
3404
+ token_ids=token_ids,
3405
+ skip_special_tokens=skip_special_tokens,
3406
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3407
+ **kwargs,
3408
+ )
3409
+
3410
+ def _decode(
3411
+ self,
3412
+ token_ids: Union[int, List[int]],
3413
+ skip_special_tokens: bool = False,
3414
+ clean_up_tokenization_spaces: bool = True,
3415
+ **kwargs,
3416
+ ) -> str:
3417
+ raise NotImplementedError
3418
+
3419
+ def get_special_tokens_mask(
3420
+ self,
3421
+ token_ids_0: List[int],
3422
+ token_ids_1: Optional[List[int]] = None,
3423
+ already_has_special_tokens: bool = False,
3424
+ ) -> List[int]:
3425
+ """
3426
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
3427
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
3428
+
3429
+ Args:
3430
+ token_ids_0 (`List[int]`):
3431
+ List of ids of the first sequence.
3432
+ token_ids_1 (`List[int]`, *optional*):
3433
+ List of ids of the second sequence.
3434
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
3435
+ Whether or not the token list is already formatted with special tokens for the model.
3436
+
3437
+ Returns:
3438
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
3439
+ """
3440
+ assert already_has_special_tokens and token_ids_1 is None, (
3441
+ "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
3442
+ "Please use a slow (full python) tokenizer to activate this argument. "
3443
+ "Or set `return_special_tokens_mask=True` when calling the encoding method "
3444
+ "to get the special tokens mask in any tokenizer. "
3445
+ )
3446
+
3447
+ all_special_ids = self.all_special_ids # cache the property
3448
+
3449
+ special_tokens_mask = [
3450
+ 1 if token in all_special_ids else 0 for token in token_ids_0
3451
+ ]
3452
+
3453
+ return special_tokens_mask
3454
+
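When `already_has_special_tokens=True`, the mask is a plain membership test against `all_special_ids`. A standalone sketch (the example ids 101/102 are illustrative assumptions):

# The mask construction above is a membership test against all_special_ids;
# the ids 101/102 below are illustrative assumptions.
def special_tokens_mask(token_ids, all_special_ids):
    return [1 if token in all_special_ids else 0 for token in token_ids]

assert special_tokens_mask([101, 7, 8, 102], {101, 102}) == [1, 0, 0, 1]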
3455
+ @staticmethod
3456
+ def clean_up_tokenization(out_string: str) -> str:
3457
+ """
3458
+ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
3459
+
3460
+ Args:
3461
+ out_string (`str`): The text to clean up.
3462
+
3463
+ Returns:
3464
+ `str`: The cleaned-up string.
3465
+ """
3466
+ out_string = (
3467
+ out_string.replace(" .", ".")
3468
+ .replace(" ?", "?")
3469
+ .replace(" !", "!")
3470
+ .replace(" ,", ",")
3471
+ .replace(" ' ", "'")
3472
+ .replace(" n't", "n't")
3473
+ .replace(" 'm", "'m")
3474
+ .replace(" 's", "'s")
3475
+ .replace(" 've", "'ve")
3476
+ .replace(" 're", "'re")
3477
+ )
3478
+ return out_string
3479
+
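The chained replacements above undo the extra spaces that whitespace-joining tokens introduces around punctuation and English contractions, for example:

# A subset of the replacements applied above, shown on a typical detokenized string.
s = "I ca n't believe it 's true ."
cleaned = s.replace(" .", ".").replace(" n't", "n't").replace(" 's", "'s")
assert cleaned == "I can't believe it's true."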
3480
+ def _eventual_warn_about_too_long_sequence(
3481
+ self, ids: List[int], max_length: Optional[int], verbose: bool
3482
+ ):
3483
+ """
3484
+ Depending on the input and internal state, we might trigger a warning about a sequence that is too long for its
3485
+ corresponding model.
3486
+
3487
+ Args:
3488
+ ids (`List[int]`): The ids produced by the tokenization.
3489
+ max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
3490
+ verbose (`bool`): Whether or not to print more information and warnings.
3491
+
3492
+ """
3493
+ if max_length is None and len(ids) > self.model_max_length and verbose:
3494
+ if not self.deprecation_warnings.get(
3495
+ "sequence-length-is-longer-than-the-specified-maximum", False
3496
+ ):
3497
+ logging.warning(
3498
+ "Token indices sequence length is longer than the specified maximum sequence length "
3499
+ f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
3500
+ "will result in indexing errors"
3501
+ )
3502
+ self.deprecation_warnings[
3503
+ "sequence-length-is-longer-than-the-specified-maximum"
3504
+ ] = True