paddlex 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (824)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +17 -34
  3. paddlex/__main__.py +1 -1
  4. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  5. paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
  6. paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
  7. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  8. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  9. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  10. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  11. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  12. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  13. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  14. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  15. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  16. paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
  17. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  18. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  19. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  20. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  21. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  22. paddlex/configs/pipelines/OCR.yaml +7 -6
  23. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  24. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  25. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  26. paddlex/configs/pipelines/anomaly_detection.yaml +1 -1
  27. paddlex/configs/pipelines/doc_understanding.yaml +9 -0
  28. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  29. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  30. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  31. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  32. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  33. paddlex/configs/pipelines/ts_anomaly_detection.yaml +1 -1
  34. paddlex/configs/pipelines/ts_classification.yaml +1 -1
  35. paddlex/configs/pipelines/ts_forecast.yaml +1 -1
  36. paddlex/constants.py +17 -0
  37. paddlex/engine.py +7 -5
  38. paddlex/hpip_links.html +23 -11
  39. paddlex/inference/__init__.py +3 -3
  40. paddlex/inference/common/__init__.py +1 -1
  41. paddlex/inference/common/batch_sampler/__init__.py +5 -4
  42. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +5 -6
  43. paddlex/inference/common/batch_sampler/base_batch_sampler.py +20 -16
  44. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +4 -7
  45. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +87 -0
  46. paddlex/inference/common/batch_sampler/image_batch_sampler.py +45 -60
  47. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +9 -10
  48. paddlex/inference/common/batch_sampler/video_batch_sampler.py +2 -22
  49. paddlex/inference/common/reader/__init__.py +4 -4
  50. paddlex/inference/common/reader/audio_reader.py +3 -3
  51. paddlex/inference/common/reader/det_3d_reader.py +7 -5
  52. paddlex/inference/common/reader/image_reader.py +16 -12
  53. paddlex/inference/common/reader/ts_reader.py +3 -2
  54. paddlex/inference/common/reader/video_reader.py +3 -3
  55. paddlex/inference/common/result/__init__.py +7 -7
  56. paddlex/inference/common/result/base_cv_result.py +12 -2
  57. paddlex/inference/common/result/base_result.py +7 -5
  58. paddlex/inference/common/result/base_ts_result.py +1 -2
  59. paddlex/inference/common/result/base_video_result.py +2 -2
  60. paddlex/inference/common/result/mixin.py +31 -25
  61. paddlex/inference/models/__init__.py +41 -85
  62. paddlex/inference/models/anomaly_detection/__init__.py +1 -1
  63. paddlex/inference/models/anomaly_detection/predictor.py +9 -19
  64. paddlex/inference/models/anomaly_detection/processors.py +9 -2
  65. paddlex/inference/models/anomaly_detection/result.py +3 -2
  66. paddlex/inference/models/base/__init__.py +2 -2
  67. paddlex/inference/models/base/predictor/__init__.py +1 -2
  68. paddlex/inference/models/base/predictor/base_predictor.py +278 -39
  69. paddlex/inference/models/common/__init__.py +6 -15
  70. paddlex/inference/models/common/static_infer.py +724 -251
  71. paddlex/inference/models/common/tokenizer/__init__.py +7 -3
  72. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +1 -1
  73. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
  74. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +9 -7
  75. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  76. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +438 -0
  77. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  78. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +85 -77
  79. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +339 -123
  80. paddlex/inference/models/common/tokenizer/utils.py +1 -1
  81. paddlex/inference/models/common/tokenizer/vocab.py +8 -8
  82. paddlex/inference/models/common/ts/__init__.py +1 -1
  83. paddlex/inference/models/common/ts/funcs.py +13 -6
  84. paddlex/inference/models/common/ts/processors.py +14 -5
  85. paddlex/inference/models/common/vision/__init__.py +3 -3
  86. paddlex/inference/models/common/vision/funcs.py +17 -12
  87. paddlex/inference/models/common/vision/processors.py +61 -46
  88. paddlex/inference/models/common/vlm/__init__.py +13 -0
  89. paddlex/inference/models/common/vlm/activations.py +189 -0
  90. paddlex/inference/models/common/vlm/bert_padding.py +127 -0
  91. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  92. paddlex/inference/models/common/vlm/distributed.py +229 -0
  93. paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
  94. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  95. paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
  96. paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
  97. paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
  98. paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
  99. paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
  100. paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
  101. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
  102. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
  103. paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
  104. paddlex/inference/models/common/vlm/transformers/model_utils.py +2014 -0
  105. paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
  106. paddlex/inference/models/common/vlm/utils.py +109 -0
  107. paddlex/inference/models/doc_vlm/__init__.py +15 -0
  108. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  109. paddlex/inference/models/doc_vlm/modeling/__init__.py +17 -0
  110. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  111. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  112. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2495 -0
  113. paddlex/inference/models/doc_vlm/predictor.py +253 -0
  114. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  115. paddlex/inference/models/doc_vlm/processors/__init__.py +17 -0
  116. paddlex/inference/models/doc_vlm/processors/common.py +561 -0
  117. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  118. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +543 -0
  119. paddlex/inference/models/doc_vlm/result.py +21 -0
  120. paddlex/inference/models/face_feature/__init__.py +1 -1
  121. paddlex/inference/models/face_feature/predictor.py +2 -1
  122. paddlex/inference/models/formula_recognition/__init__.py +1 -1
  123. paddlex/inference/models/formula_recognition/predictor.py +18 -28
  124. paddlex/inference/models/formula_recognition/processors.py +126 -97
  125. paddlex/inference/models/formula_recognition/result.py +43 -35
  126. paddlex/inference/models/image_classification/__init__.py +1 -1
  127. paddlex/inference/models/image_classification/predictor.py +9 -19
  128. paddlex/inference/models/image_classification/processors.py +4 -2
  129. paddlex/inference/models/image_classification/result.py +4 -3
  130. paddlex/inference/models/image_feature/__init__.py +1 -1
  131. paddlex/inference/models/image_feature/predictor.py +9 -19
  132. paddlex/inference/models/image_feature/processors.py +7 -5
  133. paddlex/inference/models/image_feature/result.py +2 -3
  134. paddlex/inference/models/image_multilabel_classification/__init__.py +1 -1
  135. paddlex/inference/models/image_multilabel_classification/predictor.py +7 -6
  136. paddlex/inference/models/image_multilabel_classification/processors.py +6 -2
  137. paddlex/inference/models/image_multilabel_classification/result.py +4 -3
  138. paddlex/inference/models/image_unwarping/__init__.py +1 -1
  139. paddlex/inference/models/image_unwarping/predictor.py +8 -16
  140. paddlex/inference/models/image_unwarping/processors.py +6 -2
  141. paddlex/inference/models/image_unwarping/result.py +4 -2
  142. paddlex/inference/models/instance_segmentation/__init__.py +1 -1
  143. paddlex/inference/models/instance_segmentation/predictor.py +7 -15
  144. paddlex/inference/models/instance_segmentation/processors.py +4 -7
  145. paddlex/inference/models/instance_segmentation/result.py +11 -10
  146. paddlex/inference/models/keypoint_detection/__init__.py +1 -1
  147. paddlex/inference/models/keypoint_detection/predictor.py +5 -3
  148. paddlex/inference/models/keypoint_detection/processors.py +11 -3
  149. paddlex/inference/models/keypoint_detection/result.py +9 -4
  150. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1
  151. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/predictor.py +15 -26
  152. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/processors.py +26 -14
  153. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/result.py +15 -12
  154. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/visualizer_3d.py +77 -39
  155. paddlex/inference/models/multilingual_speech_recognition/__init__.py +1 -1
  156. paddlex/inference/models/multilingual_speech_recognition/predictor.py +11 -15
  157. paddlex/inference/models/multilingual_speech_recognition/processors.py +45 -53
  158. paddlex/inference/models/multilingual_speech_recognition/result.py +1 -1
  159. paddlex/inference/models/object_detection/__init__.py +1 -1
  160. paddlex/inference/models/object_detection/predictor.py +8 -12
  161. paddlex/inference/models/object_detection/processors.py +63 -33
  162. paddlex/inference/models/object_detection/result.py +5 -4
  163. paddlex/inference/models/object_detection/utils.py +3 -1
  164. paddlex/inference/models/open_vocabulary_detection/__init__.py +1 -1
  165. paddlex/inference/models/open_vocabulary_detection/predictor.py +31 -14
  166. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +3 -2
  167. paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
  168. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +19 -8
  169. paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
  170. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +1 -1
  171. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +6 -13
  172. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +1 -1
  173. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +12 -12
  174. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +1 -1
  175. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +11 -9
  176. paddlex/inference/models/semantic_segmentation/__init__.py +1 -1
  177. paddlex/inference/models/semantic_segmentation/predictor.py +9 -18
  178. paddlex/inference/models/semantic_segmentation/processors.py +11 -8
  179. paddlex/inference/models/semantic_segmentation/result.py +4 -3
  180. paddlex/inference/models/table_structure_recognition/__init__.py +1 -1
  181. paddlex/inference/models/table_structure_recognition/predictor.py +8 -18
  182. paddlex/inference/models/table_structure_recognition/processors.py +23 -29
  183. paddlex/inference/models/table_structure_recognition/result.py +8 -15
  184. paddlex/inference/models/text_detection/__init__.py +1 -1
  185. paddlex/inference/models/text_detection/predictor.py +24 -24
  186. paddlex/inference/models/text_detection/processors.py +116 -44
  187. paddlex/inference/models/text_detection/result.py +8 -13
  188. paddlex/inference/models/text_recognition/__init__.py +1 -1
  189. paddlex/inference/models/text_recognition/predictor.py +11 -19
  190. paddlex/inference/models/text_recognition/processors.py +27 -13
  191. paddlex/inference/models/text_recognition/result.py +3 -2
  192. paddlex/inference/models/ts_anomaly_detection/__init__.py +1 -1
  193. paddlex/inference/models/ts_anomaly_detection/predictor.py +12 -17
  194. paddlex/inference/models/ts_anomaly_detection/processors.py +6 -2
  195. paddlex/inference/models/ts_anomaly_detection/result.py +21 -10
  196. paddlex/inference/models/ts_classification/__init__.py +1 -1
  197. paddlex/inference/models/ts_classification/predictor.py +14 -27
  198. paddlex/inference/models/ts_classification/processors.py +7 -2
  199. paddlex/inference/models/ts_classification/result.py +21 -12
  200. paddlex/inference/models/ts_forecasting/__init__.py +1 -1
  201. paddlex/inference/models/ts_forecasting/predictor.py +13 -18
  202. paddlex/inference/models/ts_forecasting/processors.py +12 -3
  203. paddlex/inference/models/ts_forecasting/result.py +24 -11
  204. paddlex/inference/models/video_classification/__init__.py +1 -1
  205. paddlex/inference/models/video_classification/predictor.py +9 -15
  206. paddlex/inference/models/video_classification/processors.py +24 -24
  207. paddlex/inference/models/video_classification/result.py +7 -3
  208. paddlex/inference/models/video_detection/__init__.py +1 -1
  209. paddlex/inference/models/video_detection/predictor.py +8 -15
  210. paddlex/inference/models/video_detection/processors.py +24 -11
  211. paddlex/inference/models/video_detection/result.py +10 -5
  212. paddlex/inference/pipelines/__init__.py +48 -37
  213. paddlex/inference/pipelines/_parallel.py +172 -0
  214. paddlex/inference/pipelines/anomaly_detection/__init__.py +1 -1
  215. paddlex/inference/pipelines/anomaly_detection/pipeline.py +29 -9
  216. paddlex/inference/pipelines/attribute_recognition/__init__.py +1 -1
  217. paddlex/inference/pipelines/attribute_recognition/pipeline.py +24 -9
  218. paddlex/inference/pipelines/attribute_recognition/result.py +10 -8
  219. paddlex/inference/pipelines/base.py +43 -13
  220. paddlex/inference/pipelines/components/__init__.py +14 -8
  221. paddlex/inference/pipelines/components/chat_server/__init__.py +1 -1
  222. paddlex/inference/pipelines/components/chat_server/base.py +2 -2
  223. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +8 -8
  224. paddlex/inference/pipelines/components/common/__init__.py +5 -4
  225. paddlex/inference/pipelines/components/common/base_operator.py +2 -1
  226. paddlex/inference/pipelines/components/common/base_result.py +3 -2
  227. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +1 -2
  228. paddlex/inference/pipelines/components/common/crop_image_regions.py +11 -5
  229. paddlex/inference/pipelines/components/common/seal_det_warp.py +44 -13
  230. paddlex/inference/pipelines/components/common/sort_boxes.py +4 -2
  231. paddlex/inference/pipelines/components/common/warp_image.py +50 -0
  232. paddlex/inference/pipelines/components/faisser.py +10 -5
  233. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +2 -2
  234. paddlex/inference/pipelines/components/prompt_engineering/base.py +2 -2
  235. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +2 -1
  236. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +2 -2
  237. paddlex/inference/pipelines/components/retriever/__init__.py +2 -2
  238. paddlex/inference/pipelines/components/retriever/base.py +18 -16
  239. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +2 -2
  240. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +87 -84
  241. paddlex/inference/pipelines/components/utils/__init__.py +1 -1
  242. paddlex/inference/pipelines/components/utils/mixin.py +7 -7
  243. paddlex/inference/pipelines/doc_preprocessor/__init__.py +1 -1
  244. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +70 -51
  245. paddlex/inference/pipelines/doc_preprocessor/result.py +5 -10
  246. paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
  247. paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
  248. paddlex/inference/pipelines/face_recognition/__init__.py +1 -1
  249. paddlex/inference/pipelines/face_recognition/pipeline.py +3 -1
  250. paddlex/inference/pipelines/face_recognition/result.py +3 -2
  251. paddlex/inference/pipelines/formula_recognition/__init__.py +1 -1
  252. paddlex/inference/pipelines/formula_recognition/pipeline.py +137 -93
  253. paddlex/inference/pipelines/formula_recognition/result.py +20 -29
  254. paddlex/inference/pipelines/image_classification/__init__.py +1 -1
  255. paddlex/inference/pipelines/image_classification/pipeline.py +30 -11
  256. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +1 -1
  257. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +31 -12
  258. paddlex/inference/pipelines/instance_segmentation/__init__.py +1 -1
  259. paddlex/inference/pipelines/instance_segmentation/pipeline.py +30 -9
  260. paddlex/inference/pipelines/keypoint_detection/__init__.py +1 -1
  261. paddlex/inference/pipelines/keypoint_detection/pipeline.py +30 -9
  262. paddlex/inference/pipelines/layout_parsing/__init__.py +1 -1
  263. paddlex/inference/pipelines/layout_parsing/pipeline.py +54 -56
  264. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +904 -261
  265. paddlex/inference/pipelines/layout_parsing/result.py +9 -21
  266. paddlex/inference/pipelines/layout_parsing/result_v2.py +525 -250
  267. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  268. paddlex/inference/pipelines/layout_parsing/utils.py +570 -2004
  269. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  270. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  271. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  272. paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1
  273. paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/pipeline.py +17 -10
  274. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +1 -1
  275. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +17 -6
  276. paddlex/inference/pipelines/object_detection/__init__.py +1 -1
  277. paddlex/inference/pipelines/object_detection/pipeline.py +29 -9
  278. paddlex/inference/pipelines/ocr/__init__.py +1 -1
  279. paddlex/inference/pipelines/ocr/pipeline.py +151 -77
  280. paddlex/inference/pipelines/ocr/result.py +31 -24
  281. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +1 -1
  282. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +17 -6
  283. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +1 -1
  284. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +17 -6
  285. paddlex/inference/pipelines/pp_chatocr/__init__.py +1 -1
  286. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +14 -5
  287. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +22 -14
  288. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +34 -16
  289. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +1 -1
  290. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +12 -8
  291. paddlex/inference/pipelines/pp_shitu_v2/result.py +4 -4
  292. paddlex/inference/pipelines/rotated_object_detection/__init__.py +1 -1
  293. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +30 -9
  294. paddlex/inference/pipelines/seal_recognition/__init__.py +1 -1
  295. paddlex/inference/pipelines/seal_recognition/pipeline.py +127 -63
  296. paddlex/inference/pipelines/seal_recognition/result.py +4 -2
  297. paddlex/inference/pipelines/semantic_segmentation/__init__.py +1 -1
  298. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +30 -9
  299. paddlex/inference/pipelines/small_object_detection/__init__.py +1 -1
  300. paddlex/inference/pipelines/small_object_detection/pipeline.py +30 -9
  301. paddlex/inference/pipelines/table_recognition/__init__.py +1 -1
  302. paddlex/inference/pipelines/table_recognition/pipeline.py +61 -37
  303. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +668 -65
  304. paddlex/inference/pipelines/table_recognition/result.py +12 -10
  305. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +12 -8
  306. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +55 -37
  307. paddlex/inference/pipelines/table_recognition/utils.py +1 -1
  308. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +1 -1
  309. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +16 -6
  310. paddlex/inference/pipelines/ts_classification/__init__.py +1 -1
  311. paddlex/inference/pipelines/ts_classification/pipeline.py +16 -6
  312. paddlex/inference/pipelines/ts_forecasting/__init__.py +1 -1
  313. paddlex/inference/pipelines/ts_forecasting/pipeline.py +16 -6
  314. paddlex/inference/pipelines/video_classification/__init__.py +1 -1
  315. paddlex/inference/pipelines/video_classification/pipeline.py +17 -6
  316. paddlex/inference/pipelines/video_detection/__init__.py +1 -1
  317. paddlex/inference/pipelines/video_detection/pipeline.py +20 -7
  318. paddlex/inference/serving/__init__.py +5 -1
  319. paddlex/inference/serving/basic_serving/__init__.py +1 -1
  320. paddlex/inference/serving/basic_serving/_app.py +31 -19
  321. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +7 -4
  322. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +1 -1
  323. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +12 -4
  324. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +1 -1
  325. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +7 -2
  326. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +10 -7
  327. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +10 -7
  328. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
  329. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +16 -13
  330. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +10 -7
  331. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +10 -7
  332. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +10 -7
  333. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +10 -7
  334. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +13 -7
  335. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +10 -8
  336. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +10 -7
  337. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +10 -7
  338. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +10 -7
  339. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +10 -7
  340. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +10 -7
  341. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +13 -7
  342. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +10 -7
  343. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +14 -12
  344. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +17 -14
  345. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +16 -13
  346. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +16 -9
  347. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +10 -7
  348. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +10 -7
  349. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +10 -7
  350. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +10 -7
  351. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +11 -12
  352. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +14 -12
  353. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +10 -7
  354. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +10 -7
  355. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +10 -7
  356. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +10 -7
  357. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +10 -7
  358. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +10 -7
  359. paddlex/inference/serving/basic_serving/_server.py +9 -4
  360. paddlex/inference/serving/infra/__init__.py +1 -1
  361. paddlex/inference/serving/infra/config.py +1 -1
  362. paddlex/inference/serving/infra/models.py +13 -6
  363. paddlex/inference/serving/infra/storage.py +9 -4
  364. paddlex/inference/serving/infra/utils.py +54 -28
  365. paddlex/inference/serving/schemas/__init__.py +1 -1
  366. paddlex/inference/serving/schemas/anomaly_detection.py +1 -1
  367. paddlex/inference/serving/schemas/doc_preprocessor.py +1 -1
  368. paddlex/inference/serving/schemas/doc_understanding.py +78 -0
  369. paddlex/inference/serving/schemas/face_recognition.py +1 -1
  370. paddlex/inference/serving/schemas/formula_recognition.py +2 -2
  371. paddlex/inference/serving/schemas/human_keypoint_detection.py +1 -1
  372. paddlex/inference/serving/schemas/image_classification.py +1 -1
  373. paddlex/inference/serving/schemas/image_multilabel_classification.py +1 -1
  374. paddlex/inference/serving/schemas/instance_segmentation.py +1 -1
  375. paddlex/inference/serving/schemas/layout_parsing.py +2 -3
  376. paddlex/inference/serving/schemas/m_3d_bev_detection.py +1 -1
  377. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +1 -1
  378. paddlex/inference/serving/schemas/object_detection.py +1 -1
  379. paddlex/inference/serving/schemas/ocr.py +1 -1
  380. paddlex/inference/serving/schemas/open_vocabulary_detection.py +1 -1
  381. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +1 -1
  382. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +1 -1
  383. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +2 -3
  384. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +3 -3
  385. paddlex/inference/serving/schemas/pp_shituv2.py +1 -1
  386. paddlex/inference/serving/schemas/pp_structurev3.py +11 -7
  387. paddlex/inference/serving/schemas/rotated_object_detection.py +1 -1
  388. paddlex/inference/serving/schemas/seal_recognition.py +2 -2
  389. paddlex/inference/serving/schemas/semantic_segmentation.py +1 -1
  390. paddlex/inference/serving/schemas/shared/__init__.py +1 -1
  391. paddlex/inference/serving/schemas/shared/classification.py +1 -1
  392. paddlex/inference/serving/schemas/shared/image_segmentation.py +1 -1
  393. paddlex/inference/serving/schemas/shared/object_detection.py +1 -1
  394. paddlex/inference/serving/schemas/shared/ocr.py +1 -1
  395. paddlex/inference/serving/schemas/small_object_detection.py +1 -1
  396. paddlex/inference/serving/schemas/table_recognition.py +3 -7
  397. paddlex/inference/serving/schemas/table_recognition_v2.py +6 -7
  398. paddlex/inference/serving/schemas/ts_anomaly_detection.py +1 -1
  399. paddlex/inference/serving/schemas/ts_classification.py +1 -1
  400. paddlex/inference/serving/schemas/ts_forecast.py +1 -1
  401. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +1 -1
  402. paddlex/inference/serving/schemas/video_classification.py +1 -1
  403. paddlex/inference/serving/schemas/video_detection.py +1 -1
  404. paddlex/inference/utils/__init__.py +1 -1
  405. paddlex/inference/utils/benchmark.py +332 -179
  406. paddlex/inference/utils/color_map.py +1 -1
  407. paddlex/inference/utils/get_pipeline_path.py +1 -1
  408. paddlex/inference/utils/hpi.py +258 -0
  409. paddlex/inference/utils/hpi_model_info_collection.json +2331 -0
  410. paddlex/inference/utils/io/__init__.py +11 -11
  411. paddlex/inference/utils/io/readers.py +31 -27
  412. paddlex/inference/utils/io/style.py +21 -14
  413. paddlex/inference/utils/io/tablepyxl.py +13 -5
  414. paddlex/inference/utils/io/writers.py +9 -10
  415. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  416. paddlex/inference/utils/model_paths.py +48 -0
  417. paddlex/inference/utils/{new_ir_blacklist.py → new_ir_blocklist.py} +1 -2
  418. paddlex/inference/utils/official_models.py +278 -262
  419. paddlex/inference/utils/pp_option.py +184 -92
  420. paddlex/inference/utils/trt_blocklist.py +43 -0
  421. paddlex/inference/utils/trt_config.py +420 -0
  422. paddlex/model.py +30 -12
  423. paddlex/modules/__init__.py +57 -80
  424. paddlex/modules/anomaly_detection/__init__.py +2 -2
  425. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +2 -3
  426. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  427. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  428. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +8 -4
  429. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +7 -4
  430. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +2 -2
  431. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  432. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +7 -2
  433. paddlex/modules/anomaly_detection/evaluator.py +3 -3
  434. paddlex/modules/anomaly_detection/exportor.py +1 -1
  435. paddlex/modules/anomaly_detection/model_list.py +1 -1
  436. paddlex/modules/anomaly_detection/trainer.py +3 -4
  437. paddlex/modules/base/__init__.py +5 -5
  438. paddlex/modules/base/build_model.py +1 -2
  439. paddlex/modules/base/dataset_checker/__init__.py +2 -2
  440. paddlex/modules/base/dataset_checker/dataset_checker.py +4 -4
  441. paddlex/modules/base/dataset_checker/utils.py +1 -3
  442. paddlex/modules/base/evaluator.py +13 -13
  443. paddlex/modules/base/exportor.py +12 -13
  444. paddlex/modules/base/trainer.py +21 -11
  445. paddlex/modules/base/utils/__init__.py +13 -0
  446. paddlex/modules/base/utils/cinn_setting.py +89 -0
  447. paddlex/modules/base/utils/coco_eval.py +94 -0
  448. paddlex/modules/base/utils/topk_eval.py +118 -0
  449. paddlex/modules/doc_vlm/__init__.py +18 -0
  450. paddlex/modules/doc_vlm/dataset_checker.py +29 -0
  451. paddlex/modules/doc_vlm/evaluator.py +29 -0
  452. paddlex/modules/doc_vlm/exportor.py +29 -0
  453. paddlex/modules/doc_vlm/model_list.py +16 -0
  454. paddlex/modules/doc_vlm/trainer.py +41 -0
  455. paddlex/modules/face_recognition/__init__.py +2 -2
  456. paddlex/modules/face_recognition/dataset_checker/__init__.py +2 -2
  457. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +1 -1
  458. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +3 -5
  459. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  460. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  461. paddlex/modules/face_recognition/evaluator.py +3 -3
  462. paddlex/modules/face_recognition/exportor.py +1 -1
  463. paddlex/modules/face_recognition/model_list.py +1 -1
  464. paddlex/modules/face_recognition/trainer.py +1 -1
  465. paddlex/modules/formula_recognition/__init__.py +2 -2
  466. paddlex/modules/formula_recognition/dataset_checker/__init__.py +3 -3
  467. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  468. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
  469. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +2 -6
  470. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
  471. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
  472. paddlex/modules/formula_recognition/evaluator.py +6 -3
  473. paddlex/modules/formula_recognition/exportor.py +1 -1
  474. paddlex/modules/formula_recognition/model_list.py +4 -1
  475. paddlex/modules/formula_recognition/trainer.py +5 -3
  476. paddlex/modules/general_recognition/__init__.py +2 -2
  477. paddlex/modules/general_recognition/dataset_checker/__init__.py +2 -2
  478. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  479. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +7 -9
  480. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +4 -5
  481. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +6 -5
  482. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +1 -1
  483. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  484. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  485. paddlex/modules/general_recognition/evaluator.py +2 -2
  486. paddlex/modules/general_recognition/exportor.py +1 -1
  487. paddlex/modules/general_recognition/model_list.py +1 -1
  488. paddlex/modules/general_recognition/trainer.py +1 -1
  489. paddlex/modules/image_classification/__init__.py +2 -2
  490. paddlex/modules/image_classification/dataset_checker/__init__.py +2 -2
  491. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +2 -2
  492. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  493. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  494. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +4 -4
  495. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  496. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  497. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  498. paddlex/modules/image_classification/evaluator.py +3 -3
  499. paddlex/modules/image_classification/exportor.py +1 -1
  500. paddlex/modules/image_classification/model_list.py +2 -1
  501. paddlex/modules/image_classification/trainer.py +3 -3
  502. paddlex/modules/image_unwarping/__init__.py +1 -1
  503. paddlex/modules/image_unwarping/model_list.py +1 -1
  504. paddlex/modules/instance_segmentation/__init__.py +2 -2
  505. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +2 -3
  506. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  507. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +9 -5
  508. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +8 -5
  509. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +8 -8
  510. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +7 -4
  511. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  512. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +10 -8
  513. paddlex/modules/instance_segmentation/evaluator.py +2 -2
  514. paddlex/modules/instance_segmentation/exportor.py +1 -1
  515. paddlex/modules/instance_segmentation/model_list.py +1 -1
  516. paddlex/modules/instance_segmentation/trainer.py +1 -1
  517. paddlex/modules/keypoint_detection/__init__.py +2 -2
  518. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +2 -2
  519. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +1 -1
  520. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
  521. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  522. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +8 -3
  523. paddlex/modules/keypoint_detection/evaluator.py +2 -2
  524. paddlex/modules/keypoint_detection/exportor.py +1 -1
  525. paddlex/modules/keypoint_detection/model_list.py +1 -1
  526. paddlex/modules/keypoint_detection/trainer.py +2 -2
  527. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/__init__.py +2 -2
  528. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/__init__.py +3 -3
  529. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/__init__.py +2 -2
  530. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/analyse_dataset.py +8 -8
  531. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/check_dataset.py +1 -2
  532. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/evaluator.py +3 -3
  533. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/exportor.py +1 -1
  534. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/model_list.py +1 -1
  535. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/trainer.py +5 -7
  536. paddlex/modules/multilabel_classification/__init__.py +2 -2
  537. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +2 -2
  538. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +2 -2
  539. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  540. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  541. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +10 -7
  542. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  543. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  544. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +1 -5
  545. paddlex/modules/multilabel_classification/evaluator.py +3 -3
  546. paddlex/modules/multilabel_classification/exportor.py +1 -1
  547. paddlex/modules/multilabel_classification/model_list.py +1 -1
  548. paddlex/modules/multilabel_classification/trainer.py +3 -3
  549. paddlex/modules/multilingual_speech_recognition/__init__.py +2 -2
  550. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +3 -3
  551. paddlex/modules/multilingual_speech_recognition/evaluator.py +3 -3
  552. paddlex/modules/multilingual_speech_recognition/exportor.py +3 -3
  553. paddlex/modules/multilingual_speech_recognition/model_list.py +1 -1
  554. paddlex/modules/multilingual_speech_recognition/trainer.py +7 -5
  555. paddlex/modules/object_detection/__init__.py +2 -2
  556. paddlex/modules/object_detection/dataset_checker/__init__.py +2 -11
  557. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +2 -2
  558. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +10 -8
  559. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
  560. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +17 -12
  561. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +8 -4
  562. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  563. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +9 -8
  564. paddlex/modules/object_detection/evaluator.py +11 -6
  565. paddlex/modules/object_detection/exportor.py +1 -1
  566. paddlex/modules/object_detection/model_list.py +3 -1
  567. paddlex/modules/object_detection/trainer.py +4 -5
  568. paddlex/modules/open_vocabulary_detection/__init__.py +2 -2
  569. paddlex/modules/open_vocabulary_detection/dataset_checker.py +3 -3
  570. paddlex/modules/open_vocabulary_detection/evaluator.py +3 -3
  571. paddlex/modules/open_vocabulary_detection/exportor.py +3 -3
  572. paddlex/modules/open_vocabulary_detection/model_list.py +2 -4
  573. paddlex/modules/open_vocabulary_detection/trainer.py +7 -5
  574. paddlex/modules/open_vocabulary_segmentation/__init__.py +2 -2
  575. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +3 -3
  576. paddlex/modules/open_vocabulary_segmentation/evaluator.py +3 -3
  577. paddlex/modules/open_vocabulary_segmentation/exportor.py +3 -3
  578. paddlex/modules/open_vocabulary_segmentation/model_list.py +1 -1
  579. paddlex/modules/open_vocabulary_segmentation/trainer.py +7 -5
  580. paddlex/modules/semantic_segmentation/__init__.py +2 -2
  581. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +2 -3
  582. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  583. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  584. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +2 -2
  585. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +7 -4
  586. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +2 -2
  587. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  588. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +6 -2
  589. paddlex/modules/semantic_segmentation/evaluator.py +3 -3
  590. paddlex/modules/semantic_segmentation/exportor.py +1 -1
  591. paddlex/modules/semantic_segmentation/model_list.py +1 -1
  592. paddlex/modules/semantic_segmentation/trainer.py +3 -4
  593. paddlex/modules/table_recognition/__init__.py +2 -2
  594. paddlex/modules/table_recognition/dataset_checker/__init__.py +5 -5
  595. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  596. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +3 -2
  597. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +8 -7
  598. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +2 -1
  599. paddlex/modules/table_recognition/evaluator.py +3 -3
  600. paddlex/modules/table_recognition/exportor.py +1 -1
  601. paddlex/modules/table_recognition/model_list.py +1 -1
  602. paddlex/modules/table_recognition/trainer.py +2 -5
  603. paddlex/modules/text_detection/__init__.py +2 -2
  604. paddlex/modules/text_detection/dataset_checker/__init__.py +4 -6
  605. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +2 -2
  606. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +12 -9
  607. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +3 -3
  608. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +3 -3
  609. paddlex/modules/text_detection/evaluator.py +3 -3
  610. paddlex/modules/text_detection/exportor.py +1 -1
  611. paddlex/modules/text_detection/model_list.py +3 -1
  612. paddlex/modules/text_detection/trainer.py +2 -5
  613. paddlex/modules/text_recognition/__init__.py +2 -2
  614. paddlex/modules/text_recognition/dataset_checker/__init__.py +4 -5
  615. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  616. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
  617. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +2 -5
  618. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
  619. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
  620. paddlex/modules/text_recognition/evaluator.py +3 -3
  621. paddlex/modules/text_recognition/exportor.py +1 -1
  622. paddlex/modules/text_recognition/model_list.py +3 -1
  623. paddlex/modules/text_recognition/trainer.py +2 -3
  624. paddlex/modules/ts_anomaly_detection/__init__.py +2 -2
  625. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +4 -5
  626. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  627. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  628. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +2 -2
  629. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +2 -6
  630. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +4 -4
  631. paddlex/modules/ts_anomaly_detection/evaluator.py +3 -3
  632. paddlex/modules/ts_anomaly_detection/exportor.py +2 -3
  633. paddlex/modules/ts_anomaly_detection/model_list.py +1 -1
  634. paddlex/modules/ts_anomaly_detection/trainer.py +8 -8
  635. paddlex/modules/ts_classification/__init__.py +2 -2
  636. paddlex/modules/ts_classification/dataset_checker/__init__.py +4 -5
  637. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +2 -2
  638. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -5
  639. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +2 -2
  640. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +2 -6
  641. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +5 -5
  642. paddlex/modules/ts_classification/evaluator.py +3 -3
  643. paddlex/modules/ts_classification/exportor.py +2 -3
  644. paddlex/modules/ts_classification/model_list.py +1 -1
  645. paddlex/modules/ts_classification/trainer.py +7 -7
  646. paddlex/modules/ts_forecast/__init__.py +2 -2
  647. paddlex/modules/ts_forecast/dataset_checker/__init__.py +4 -5
  648. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +2 -2
  649. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  650. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +2 -2
  651. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +2 -6
  652. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +4 -4
  653. paddlex/modules/ts_forecast/evaluator.py +3 -3
  654. paddlex/modules/ts_forecast/exportor.py +2 -3
  655. paddlex/modules/ts_forecast/model_list.py +1 -1
  656. paddlex/modules/ts_forecast/trainer.py +7 -7
  657. paddlex/modules/video_classification/__init__.py +2 -2
  658. paddlex/modules/video_classification/dataset_checker/__init__.py +2 -2
  659. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +2 -2
  660. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +9 -9
  661. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +2 -3
  662. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  663. paddlex/modules/video_classification/evaluator.py +3 -3
  664. paddlex/modules/video_classification/exportor.py +1 -1
  665. paddlex/modules/video_classification/model_list.py +1 -1
  666. paddlex/modules/video_classification/trainer.py +3 -3
  667. paddlex/modules/video_detection/__init__.py +2 -2
  668. paddlex/modules/video_detection/dataset_checker/__init__.py +2 -2
  669. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +2 -2
  670. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  671. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +3 -5
  672. paddlex/modules/video_detection/evaluator.py +3 -3
  673. paddlex/modules/video_detection/exportor.py +1 -1
  674. paddlex/modules/video_detection/model_list.py +1 -1
  675. paddlex/modules/video_detection/trainer.py +3 -3
  676. paddlex/ops/__init__.py +7 -4
  677. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +8 -6
  678. paddlex/ops/iou3d_nms/iou3d_cpu.h +3 -2
  679. paddlex/ops/iou3d_nms/iou3d_nms.cpp +8 -6
  680. paddlex/ops/iou3d_nms/iou3d_nms.h +6 -4
  681. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +24 -18
  682. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +9 -7
  683. paddlex/ops/setup.py +3 -3
  684. paddlex/ops/voxel/voxelize_op.cc +22 -19
  685. paddlex/ops/voxel/voxelize_op.cu +25 -25
  686. paddlex/paddlex_cli.py +104 -87
  687. paddlex/repo_apis/Paddle3D_api/__init__.py +1 -1
  688. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +1 -1
  689. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +1 -1
  690. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +6 -6
  691. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +2 -2
  692. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +1 -1
  693. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +3 -2
  694. paddlex/repo_apis/PaddleClas_api/__init__.py +1 -1
  695. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +3 -3
  696. paddlex/repo_apis/PaddleClas_api/cls/config.py +5 -4
  697. paddlex/repo_apis/PaddleClas_api/cls/model.py +4 -4
  698. paddlex/repo_apis/PaddleClas_api/cls/register.py +12 -3
  699. paddlex/repo_apis/PaddleClas_api/cls/runner.py +2 -3
  700. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +2 -2
  701. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +2 -2
  702. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +1 -4
  703. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +2 -2
  704. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +1 -6
  705. paddlex/repo_apis/PaddleDetection_api/__init__.py +2 -2
  706. paddlex/repo_apis/PaddleDetection_api/config_helper.py +3 -3
  707. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +2 -2
  708. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +2 -3
  709. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +4 -4
  710. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +2 -3
  711. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +2 -3
  712. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +3 -3
  713. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +5 -4
  714. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +6 -7
  715. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +26 -1
  716. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +32 -3
  717. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +2 -3
  718. paddlex/repo_apis/PaddleNLP_api/__init__.py +1 -1
  719. paddlex/repo_apis/PaddleOCR_api/__init__.py +4 -3
  720. paddlex/repo_apis/PaddleOCR_api/config_utils.py +1 -1
  721. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +1 -1
  722. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +7 -6
  723. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +9 -13
  724. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +29 -3
  725. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +2 -3
  726. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +1 -1
  727. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +1 -1
  728. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +4 -4
  729. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +2 -3
  730. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +3 -3
  731. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +1 -1
  732. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +1 -1
  733. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +4 -4
  734. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +20 -3
  735. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +3 -3
  736. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +1 -1
  737. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +7 -6
  738. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +9 -13
  739. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +20 -3
  740. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +2 -3
  741. paddlex/repo_apis/PaddleSeg_api/__init__.py +1 -1
  742. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +2 -2
  743. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +1 -1
  744. paddlex/repo_apis/PaddleSeg_api/seg/config.py +3 -6
  745. paddlex/repo_apis/PaddleSeg_api/seg/model.py +6 -6
  746. paddlex/repo_apis/PaddleSeg_api/seg/register.py +2 -3
  747. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +2 -3
  748. paddlex/repo_apis/PaddleTS_api/__init__.py +4 -3
  749. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +1 -1
  750. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +5 -6
  751. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +2 -2
  752. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +2 -2
  753. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +1 -1
  754. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +2 -4
  755. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +4 -4
  756. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +2 -2
  757. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +1 -1
  758. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +4 -5
  759. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +2 -2
  760. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +2 -2
  761. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +1 -1
  762. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +6 -7
  763. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +1 -1
  764. paddlex/repo_apis/PaddleVideo_api/__init__.py +1 -1
  765. paddlex/repo_apis/PaddleVideo_api/config_utils.py +1 -1
  766. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +3 -3
  767. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +5 -4
  768. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +4 -4
  769. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +2 -3
  770. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +2 -3
  771. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +3 -3
  772. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +5 -4
  773. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +5 -5
  774. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +2 -3
  775. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +2 -3
  776. paddlex/repo_apis/__init__.py +1 -1
  777. paddlex/repo_apis/base/__init__.py +4 -5
  778. paddlex/repo_apis/base/config.py +3 -4
  779. paddlex/repo_apis/base/model.py +11 -19
  780. paddlex/repo_apis/base/register.py +1 -1
  781. paddlex/repo_apis/base/runner.py +11 -12
  782. paddlex/repo_apis/base/utils/__init__.py +1 -1
  783. paddlex/repo_apis/base/utils/arg.py +1 -1
  784. paddlex/repo_apis/base/utils/subprocess.py +1 -1
  785. paddlex/repo_manager/__init__.py +2 -9
  786. paddlex/repo_manager/core.py +12 -30
  787. paddlex/repo_manager/meta.py +41 -31
  788. paddlex/repo_manager/repo.py +171 -161
  789. paddlex/repo_manager/utils.py +13 -224
  790. paddlex/utils/__init__.py +1 -1
  791. paddlex/utils/cache.py +8 -10
  792. paddlex/utils/config.py +6 -5
  793. paddlex/utils/{custom_device_whitelist.py → custom_device_list.py} +53 -199
  794. paddlex/utils/deps.py +249 -0
  795. paddlex/utils/device.py +87 -36
  796. paddlex/utils/download.py +4 -4
  797. paddlex/utils/env.py +37 -7
  798. paddlex/utils/errors/__init__.py +1 -1
  799. paddlex/utils/errors/dataset_checker.py +1 -1
  800. paddlex/utils/errors/others.py +2 -16
  801. paddlex/utils/file_interface.py +4 -5
  802. paddlex/utils/flags.py +17 -12
  803. paddlex/utils/fonts/__init__.py +36 -5
  804. paddlex/utils/func_register.py +1 -1
  805. paddlex/utils/install.py +87 -0
  806. paddlex/utils/interactive_get_pipeline.py +3 -3
  807. paddlex/utils/lazy_loader.py +3 -3
  808. paddlex/utils/logging.py +10 -1
  809. paddlex/utils/misc.py +6 -6
  810. paddlex/utils/pipeline_arguments.py +15 -7
  811. paddlex/utils/result_saver.py +4 -5
  812. paddlex/utils/subclass_register.py +2 -4
  813. paddlex/version.py +2 -1
  814. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/METADATA +237 -102
  815. paddlex-3.0.1.dist-info/RECORD +1095 -0
  816. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  817. paddlex/inference/models/base/predictor/basic_predictor.py +0 -139
  818. paddlex/paddle2onnx_requirements.txt +0 -1
  819. paddlex/repo_manager/requirements.txt +0 -21
  820. paddlex/serving_requirements.txt +0 -9
  821. paddlex-3.0.0rc0.dist-info/RECORD +0 -1015
  822. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  823. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info/licenses}/LICENSE +0 -0
  824. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,18 +13,27 @@
 # limitations under the License.

 import copy
+import inspect
 import io
 import json
 import os
-
 import warnings
-from collections import OrderedDict, UserDict
+from collections import UserDict
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)

 import numpy as np
-import lazy_paddle as paddle

 from .....utils import logging

@@ -44,7 +53,6 @@ __all__ = [

 TOKENIZER_CONFIG_NAME = "tokenizer_config.json"
 CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"
-CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"

 VERY_LARGE_INTEGER = int(
     1e30
@@ -92,8 +100,6 @@ class AddedToken:
 class FastEncoding:
     """This is dummy class reserved for fast tokenizer"""

-    pass
-

 class ExplicitEnum(Enum):
     """
@@ -132,6 +138,8 @@ def to_py_obj(obj):
     """
     Convert a Paddle tensor, Numpy array or python list to a python list.
     """
+    import paddle
+
     if isinstance(obj, (dict, UserDict)):
        return {k: to_py_obj(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
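Note that the module-level `import lazy_paddle as paddle` is dropped in favor of importing `paddle` inside the functions that actually touch tensors (`to_py_obj` here, `convert_to_tensors` and `pad` below), so importing the tokenizer module no longer loads the framework. A minimal sketch of the same deferred-import pattern; the `tensor_to_list` helper is illustrative, not part of the diff:

    def tensor_to_list(obj):
        # Defer the heavy framework import to call time so that importing
        # this module stays cheap for callers that never pass tensors.
        import paddle

        if isinstance(obj, paddle.Tensor):
            return obj.numpy().tolist()
        return obj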
@@ -289,10 +297,6 @@ class BatchEncoding(UserDict):
     def items(self):
         return self.data.items()

-    # After this point:
-    # Extended properties and methods only available for fast tokenizers
-    # not yet supported
-
     @property
     def encodings(self) -> Optional[List[FastEncoding]]:
         """
@@ -722,6 +726,8 @@ class BatchEncoding(UserDict):
             prepend_batch_axis (`int`, *optional*, defaults to `False`):
                 Whether or not to add the batch dimension during the conversion.
         """
+        import paddle
+
         if tensor_type is None:
             return self

@@ -850,15 +856,17 @@ class SpecialTokensMixin:
         return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

     def add_special_tokens(
-        self, special_tokens_dict: Dict[str, Union[str, AddedToken]]
+        self,
+        special_tokens_dict: Dict[str, Union[str, AddedToken]],
+        replace_additional_special_tokens=True,
     ) -> int:
         """
         Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
         special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
         current vocabulary).

-        Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
-        matrix of the model so that its embedding matrix matches the tokenizer.
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
+        model so that its embedding matrix matches the tokenizer.

         In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

@@ -879,6 +887,13 @@ class SpecialTokensMixin:

                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
+            replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`):
+                If `True`, the existing list of additional special tokens will be replaced by the list provided in
+                `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former
+                case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
+                as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
+                `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
+                `additional_special_tokens` are still added tokens, and will not be split by the model.

         Returns:
             `int`: Number of tokens added to the vocabulary.
@@ -902,7 +917,7 @@ class SpecialTokensMixin:
         if not special_tokens_dict:
             return 0

-        added_tokens = 0
+        added_tokens = []
         for key, value in special_tokens_dict.items():
             assert (
                 key in self.SPECIAL_TOKENS_ATTRIBUTES
@@ -910,19 +925,37 @@ class SpecialTokensMixin:

             if self.verbose:
                 logging.info(f"Assigning {value} to the {key} key of the tokenizer")
-            setattr(self, key, value)

             if key == "additional_special_tokens":
                 assert isinstance(value, (list, tuple)) and all(
                     isinstance(t, (str, AddedToken)) for t in value
                 ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
-                added_tokens += self.add_tokens(value, special_tokens=True)
+
+                to_add = []
+                for token in value:
+                    if (
+                        not replace_additional_special_tokens
+                        and str(token) in self.additional_special_tokens
+                    ):
+                        continue
+                    to_add.append(token)
+                if replace_additional_special_tokens and len(to_add) > 0:
+                    setattr(self, key, list(to_add))
+                else:
+                    self._additional_special_tokens.extend(to_add)
+                added_tokens += to_add
+
             else:
-                assert isinstance(
-                    value, (str, AddedToken)
-                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
-                added_tokens += self.add_tokens([value], special_tokens=True)
+                if not isinstance(value, (str, AddedToken)):
+                    raise ValueError(
+                        f"Token {value} for key {key} should be a str or an AddedToken instance"
+                    )
+                setattr(self, key, value)
+                if value not in added_tokens:
+                    added_tokens.append(value)

+        # if we are adding tokens that were not part of the vocab, we ought to add them
+        added_tokens = self.add_tokens(added_tokens, special_tokens=True)
         return added_tokens

     def add_tokens(
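The rewritten `add_special_tokens` collects tokens first and registers them once at the end, and the new `replace_additional_special_tokens` flag chooses between replacing and extending the extra-token list. A hedged usage sketch (`tok` stands for any tokenizer built on `SpecialTokensMixin`; the token strings are invented):

    tok.add_special_tokens({"additional_special_tokens": ["<obs>"]})
    # Default replace_additional_special_tokens=True: the old list is replaced.
    assert tok.additional_special_tokens == ["<obs>"]

    tok.add_special_tokens(
        {"additional_special_tokens": ["<act>"]},
        replace_additional_special_tokens=False,
    )
    # With the flag off, "<obs>" is kept and "<act>" is appended.
    assert set(tok.additional_special_tokens) == {"<obs>", "<act>"}

Note also the return-value change: the method now returns whatever `add_tokens` reports for the collected batch, instead of a running integer sum.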
@@ -972,6 +1005,11 @@ class SpecialTokensMixin:

         return self._add_tokens(new_tokens, special_tokens=special_tokens)

+    @classmethod
+    def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedToken]):
+        if extra_sp_token not in cls.SPECIAL_TOKENS_ATTRIBUTES:
+            cls.SPECIAL_TOKENS_ATTRIBUTES.append(extra_sp_token)
+
     def _add_tokens(
         self,
         new_tokens: Union[List[str], List[AddedToken]],
@@ -1238,7 +1276,13 @@ class SpecialTokensMixin:
         """
         set_attr = {}
         for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-            attr_value = getattr(self, "_" + attr)
+            try:
+                attr_value = getattr(self, "_" + attr)
+            except:
+                try:
+                    attr_value = getattr(self, attr)
+                except:
+                    continue
             if attr_value:
                 set_attr[attr] = (
                     type(attr_value)(
@@ -1262,7 +1306,13 @@ class SpecialTokensMixin:
         """
         set_attr = {}
         for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-            attr_value = getattr(self, "_" + attr, None)
+            try:
+                attr_value = getattr(self, "_" + attr)
+            except:
+                try:
+                    attr_value = getattr(self, attr)
+                except:
+                    continue
             if attr_value:
                 set_attr[attr] = attr_value
         return set_attr
@@ -1286,16 +1336,16 @@ class SpecialTokensMixin:
         Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
         special tokens are tokenized.
         """
-        all_toks = []
-        set_attr = self.special_tokens_map_extended
-        for attr_value in set_attr.values():
-            all_toks = all_toks + (
-                list(attr_value)
-                if isinstance(attr_value, (list, tuple))
-                else [attr_value]
-            )
-        all_toks = list(OrderedDict.fromkeys(all_toks))
-        return all_toks
+        all_tokens = []
+        seen = set()
+        for value in self.special_tokens_map_extended.values():
+            if isinstance(value, (list, tuple)):
+                tokens_to_add = [token for token in value if str(token) not in seen]
+            else:
+                tokens_to_add = [value] if str(value) not in seen else []
+            seen.update(map(str, tokens_to_add))
+            all_tokens.extend(tokens_to_add)
+        return all_tokens

     @property
     def all_special_ids(self) -> List[int]:
@@ -1419,6 +1469,12 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

+        self.clean_up_tokenization_spaces = kwargs.pop(
+            "clean_up_tokenization_spaces", False
+        )
+
+        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
+
         self.deprecation_warnings = (
             {}
         )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
@@ -1466,7 +1522,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

     @max_len_sentences_pair.setter
     def max_len_sentences_pair(self, value) -> int:
-        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
         if (
             value == self.model_max_length - self.num_special_tokens_to_add(pair=True)
             and self.verbose
@@ -1488,10 +1543,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         self._processor_class = processor_class

     def __repr__(self) -> str:
+        added_tokens_decoder_rep = "\n\t".join(
+            [f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]
+        )
         return (
-            f"{'PretrainedTokenizer'}(name_or_path='{self.name_or_path}', "
-            f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, "
-            f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})"
+            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
+            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
+            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
+            f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
+            " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
         )

     def get_vocab(self) -> Dict[str, int]:
@@ -1547,17 +1607,13 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 # Load from local directory path
                 tokenizer = BertTokenizer.from_pretrained('./my_bert/')
         """
-
-        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         cache_dir = kwargs.pop("cache_dir", None)
         from_hf_hub = kwargs.pop("from_hf_hub", False)
         from_aistudio = kwargs.pop("from_aistudio", False)
         subfolder = kwargs.pop("subfolder", "")
         return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False)

-        if subfolder is None:
-            subfolder = ""
-
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         vocab_files = {}
         init_configuration = {}

@@ -1568,12 +1624,17 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME,
         }

+        if hasattr(cls, "vocab_files_names") and len(cls.resource_files_names) == 0:
+            cls.resource_files_names = copy.deepcopy(cls.vocab_files_names)
+            logging.error(
+                "The attribute 'vocab_files_names' is deprecated. Please use 'resource_files_names' instead.",
+                DeprecationWarning,
+            )
         vocab_files_target = {**cls.resource_files_names, **additional_files_names}
-
         # From HF Hub or AI Studio
         if from_hf_hub or from_aistudio:
             # Only include the necessary resource files specified by the tokenizer cls
-            # Deep copy to avoid modifiying the class attributes
+            # Deep copy to avoid modifying the class attributes
             vocab_files = copy.deepcopy(cls.resource_files_names)
             vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file

@@ -1597,29 +1658,58 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             # Assuming from community-contributed pretrained models
             for file_id, file_name in vocab_files_target.items():
                 vocab_files[file_id] = file_name
-
         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
-            if file_path is None or os.path.isfile(file_path):
-                resolved_vocab_files[file_id] = file_path
-                continue
-            else:
-                logging.warnings("need to download tokenizer, but not support yet.")
-                # tokenizer download not support yet
-                # resolved_vocab_files[file_id] = resolve_file_path(
-                #     pretrained_model_name_or_path,
-                #     [file_path],
-                #     subfolder,
-                #     cache_dir=cache_dir,
-                #     from_aistudio=from_aistudio,
-                #     from_hf_hub=from_hf_hub,
-                # )
+            # adapt to PaddleX
+            resolved_vocab_files[file_id] = file_path

         for file_id, file_path in resolved_vocab_files.items():
             if resolved_vocab_files[file_id] is not None:
                 cache_dir = os.path.dirname(resolved_vocab_files[file_id])
                 break
+        return cls._from_pretrained(
+            resolved_vocab_files,
+            pretrained_model_name_or_path,
+            init_configuration,
+            *args,
+            cache_dir=cache_dir,
+            return_tokenizer_file_dir=return_tokenizer_file_dir,
+            from_hf_hub=from_hf_hub,
+            **kwargs,
+        )

+    @classmethod
+    def _from_pretrained(
+        cls,
+        resolved_vocab_files,
+        pretrained_model_name_or_path,
+        init_configuration,
+        *init_inputs,
+        cache_dir=None,
+        return_tokenizer_file_dir=False,
+        from_hf_hub=False,
+        **kwargs,
+    ):
+        if cls.__name__.endswith("Fast"):
+            from_slow = kwargs.get("from_slow", False)
+        else:
+            from_slow = kwargs.get("from_slow", True)
+        has_tokenizer_file = (
+            resolved_vocab_files.get("tokenizer_file", None) is not None
+        )
+        if (
+            from_slow or not has_tokenizer_file
+        ) and cls.slow_tokenizer_class is not None:
+            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
+                copy.deepcopy(resolved_vocab_files),
+                pretrained_model_name_or_path,
+                copy.deepcopy(init_configuration),
+                *init_inputs,
+                cache_dir=cache_dir,
+                **(copy.deepcopy(kwargs)),
+            )
+        else:
+            slow_tokenizer = None
         tokenizer_config_file_dir_list = set()
         for k, v in resolved_vocab_files.items():
             if v is not None and os.path.isfile(v):
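File resolution is now strictly local ("adapt to PaddleX"): every path in `vocab_files` is taken as-is, nothing is downloaded, and construction moves into the new `_from_pretrained` classmethod. In practice this means all tokenizer resources must already sit in one directory; a hedged sketch (class and directory names illustrative, not from the diff):

    # vocab file(s), tokenizer_config.json, chat_template.json, ... must all
    # live together; from_pretrained only wires the local paths through.
    tokenizer = MyTokenizer.from_pretrained("./my_tokenizer_dir/")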
@@ -1629,8 +1719,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         assert (
             len(tokenizer_config_file_dir_list) > 0
         ), "All tokenizer files should be in the same directory."
-        # Prepare tokenizer initialization kwargs
-        # Did we saved some inputs and kwargs to reload ?
+
         has_tokenizer_file = (
             resolved_vocab_files.get("tokenizer_file", None) is not None
         )
@@ -1638,15 +1727,34 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         if tokenizer_config_file is not None:
             with io.open(tokenizer_config_file, encoding="utf-8") as f:
                 init_kwargs = json.load(f)
+            init_kwargs.pop("tokenizer_class", None)
         else:
             init_kwargs = init_configuration

-        # position args are stored in kwargs, maybe better not include
-        init_args = init_kwargs.pop("init_args", ())
+        if slow_tokenizer is not None:
+            init_kwargs["__slow_tokenizer"] = slow_tokenizer
+        init_kwargs["name_or_path"] = pretrained_model_name_or_path
+        init_kwargs["from_slow"] = from_slow
+
+        pass_added_tokens_file = False
+        added_tokens_decoder: Dict[int, AddedToken] = {}
+        if "added_tokens_decoder" in init_kwargs:
+            for idx, token in init_kwargs["added_tokens_decoder"].items():
+                if isinstance(token, dict):
+                    token = AddedToken(**token)
+                if isinstance(token, AddedToken):
+                    added_tokens_decoder[int(idx)] = token
+                else:
+                    raise ValueError(
+                        f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
+                    )
+            init_kwargs["added_tokens_decoder"] = (
+                added_tokens_decoder  # NOTE: in tokenizer_config.json, the registered `added_tokens_decoder` is parsed as a plain dict
+            )
+            pass_added_tokens_file = True
+
         init_kwargs.pop("init_class", None)

-        # Update with newly provided args and kwargs
-        init_args = init_args if not args else args
         init_kwargs.update(kwargs)

         def convert_added_tokens(obj):
@@ -1664,10 +1772,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             return obj

         init_kwargs = convert_added_tokens(init_kwargs)
-        # Set max length if needed
         if pretrained_model_name_or_path in cls.max_model_input_sizes:
-            # if we're using a pretrained model, ensure the tokenizer
-            # wont index sequences longer than the number of positional embeddings
             model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
             if model_max_length is not None and isinstance(
                 model_max_length, (int, float)
@@ -1676,32 +1781,28 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 init_kwargs.get("model_max_length", int(1e30)), model_max_length
             )

-        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
-        # Merge resolved_vocab_files arguments in init_kwargs if not including.
-        # Maybe need more ways to load resources.
         for args_name, file_path in resolved_vocab_files.items():
-            # when `pretrained_model_name_or_path` is a pretrained model name,
-            # use pretrained_init_configuration as `init_kwargs` to init which
-            # does not include the vocab file in it, thus add vocab file into
-            # args.
-            if args_name not in init_kwargs:
+            if args_name not in init_kwargs or init_kwargs[args_name] is None:
                 init_kwargs[args_name] = file_path
-            # when `pretrained_model_name_or_path` is a pretrained model dir,
-            # use tokenizer_config_file.json as `init_kwargs` to init which
-            # does include a vocab file path in it. However, if the vocab file
-            # path included in json does not exist, such as was deleted, to make
-            # it still work, use the vocab file under this dir.
             elif not os.path.isfile(init_kwargs[args_name] or "") and os.path.isfile(
                 file_path
             ):
                 init_kwargs[args_name] = file_path

-        # TODO(zhoushunjie): It's not supportted to load tokenizer.json of hf so far.
         if from_hf_hub and "tokenizer_file" in init_kwargs:
             init_kwargs.pop("tokenizer_file")

-        # TODO(guosheng): avoid reduplication of position args and key word args
-        tokenizer = cls(*init_args, **init_kwargs)
+        try:
+            tokenizer = cls(*init_inputs, **init_kwargs)
+        # adapt to PaddleX
+        except RuntimeError as e:
+            if "sentencepiece_processor.cc" in str(e):
+                logging.info(
+                    "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
+                    "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).",
+                )
+                return False
+
         chat_template = init_kwargs.pop("chat_template", None)
         if chat_template is not None:
             tokenizer.init_chat_template(chat_template)
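Worth noting for callers: when the underlying SentencePiece model cannot be parsed, `_from_pretrained` now returns `False` instead of raising, on the expectation that a TikToken-based load is tried next. A hedged sketch of the implied calling convention (the tokenizer class names are hypothetical):

    tokenizer = MySPMTokenizer.from_pretrained("./my_tokenizer_dir/")
    if tokenizer is False:
        # SPM vocab failed to load; fall back to a TikToken-based tokenizer.
        tokenizer = MyTikTokenTokenizer.from_pretrained("./my_tokenizer_dir/")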
@@ -1715,11 +1816,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 special_tokens_map = json.load(special_tokens_map_handle)
             for key, value in special_tokens_map.items():
                 if key in kwargs and kwargs[key]:
-                    # This value has already been redefined by the kwargs
-                    # We keep this new value and ignore the one stored in the special_tokens_map_file
-
                     continue
-
                 if isinstance(value, dict):
                     value = AddedToken(**value)
                 elif isinstance(value, list):
@@ -1728,13 +1825,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                         for token in value
                     ]
                 setattr(tokenizer, key, value)
-        # Add supplementary tokens.
+                cls._add_extra_special_tokens(key)
+
         special_tokens = tokenizer.all_special_tokens
+        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
+        added_tokens_file = None if pass_added_tokens_file else added_tokens_file
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)

-            # Sort added tokens by index
             added_tok_encoder_sorted = list(
                 sorted(added_tok_encoder.items(), key=lambda x: x[1])
             )
@@ -1744,14 +1843,11 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                     and index != len(tokenizer)
                     and tokenizer.convert_tokens_to_ids(token) != index
                 ):
-                    # index is the current length of the tokenizer (not in vocabulary)
                     raise ValueError(
                         f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
                         f"{index}."
                     )
                 elif not has_tokenizer_file and index != len(tokenizer):
-                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
-                    # current length of the tokenizer.
                     raise ValueError(
                         f"Non-consecutive added token '{token}' found. "
                         f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
@@ -1760,15 +1856,12 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 tokenizer.add_tokens(
                     token, special_tokens=bool(token in special_tokens)
                 )
-        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
         added_tokens = tokenizer.sanitize_special_tokens()
         if added_tokens:
             logging.info(
                 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
             )
-        # save all of related things into default root dir
         if pretrained_model_name_or_path in cls.pretrained_init_configuration:
-            # tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder))
             tokenizer.save_pretrained(cache_dir)

         if return_tokenizer_file_dir:
@@ -1827,7 +1920,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         for file_id in self.resource_files_names.keys():
             tokenizer_config.pop(file_id, None)

-        # Sanitize AddedTokens
         def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
             if isinstance(obj, AddedToken):
                 out = obj.__getstate__()
@@ -1845,10 +1937,16 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             }
             return obj

-        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
         tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

-        # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
+        added_tokens = {}
+        for key, value in self.added_tokens_decoder.items():
+            if isinstance(value, AddedToken):
+                added_tokens[key] = value.__getstate__()
+            else:
+                added_tokens[key] = AddedToken(value).__getstate__()
+        tokenizer_config["added_tokens_decoder"] = added_tokens
+
         tokenizer_class = self.__class__.__name__
         tokenizer_config["tokenizer_class"] = tokenizer_class

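`save_pretrained` therefore round-trips the added-token registry through `tokenizer_config.json`, matching the `added_tokens_decoder` parsing added in `_from_pretrained` (which also suppresses the legacy `added_tokens.json` via `pass_added_tokens_file`). Roughly the serialized shape, shown here as a Python literal; the token text and id are invented, and the exact field set follows `AddedToken.__getstate__()`:

    tokenizer_config = {
        "tokenizer_class": "PretrainedTokenizer",
        "added_tokens_decoder": {
            # one entry per added token id; the value carries the AddedToken
            # fields (content plus flags such as lstrip/rstrip/single_word)
            50257: {"content": "<|observation|>", "special": True},
        },
    }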
@@ -1856,7 +1954,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             f.write(json.dumps(tokenizer_config, ensure_ascii=False))
         logging.info(f"tokenizer config file saved in {tokenizer_config_file}")

-        # Sanitize AddedTokens in special_tokens_map
         write_dict = convert_added_tokens(
             self.special_tokens_map_extended, add_type_field=False
         )
@@ -1946,8 +2043,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
         old_pad_to_max_length = kwargs.pop("pad_to_max_seq_len", False)

-        # Backward compatibility for previous behavior, maybe we should deprecate it:
-        # If you only set max_length, it activates truncation for max_length
         if max_length is not None and padding is False and truncation is False:
             if verbose:
                 if not self.deprecation_warnings.get(
@@ -1992,7 +2087,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                     warnings.warn(
                         "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`."
                     )
-                # Default to pad to the longest sequence in the batch
                 padding_strategy = PaddingStrategy.LONGEST
             elif not isinstance(padding, PaddingStrategy):
                 padding_strategy = PaddingStrategy(padding)
@@ -2106,6 +2200,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         return_offsets_mapping: bool = False,
         add_special_tokens: bool = True,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
         **kwargs,
@@ -2215,6 +2310,9 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
                 Defaults to `None`.
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_tensors (str or [TensorType], optional):
                 If set, will return tensors instead of list of python integers. Acceptable values are:

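The new per-call `padding_side` threads from `__call__` all the way down to `_pad`, so left padding (the usual choice for decoder-only generation) no longer requires mutating the tokenizer instance. A hedged usage sketch (`tok` stands for any `PretrainedTokenizerBase` subclass; the prompts are invented):

    batch = tok(
        ["a short prompt", "a noticeably longer prompt"],
        padding=True,
        padding_side="left",  # per-call override; tok.padding_side is untouched
        return_attention_mask=True,
    )
    # Pad ids and attention-mask zeros now sit on the left of each row.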
@@ -2333,6 +2431,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 return_offsets_mapping=return_offsets_mapping,
                 add_special_tokens=add_special_tokens,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 verbose=verbose,
                 **kwargs,
@@ -2355,6 +2454,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 return_offsets_mapping=return_offsets_mapping,
                 add_special_tokens=add_special_tokens,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 verbose=verbose,
                 **kwargs,
@@ -2371,6 +2471,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -2427,6 +2528,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_position_ids=return_position_ids,
             return_token_type_ids=return_token_type_ids,
@@ -2449,6 +2551,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         stride: int = 0,
         is_split_into_words: bool = False,
+        padding_side: Optional[Literal["right", "left"]] = None,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2502,6 +2605,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -2524,6 +2628,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_position_ids: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2563,6 +2668,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         return_offsets_mapping=False,
         add_special_tokens=True,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
         **kwargs,
@@ -2615,6 +2721,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_position_ids=return_position_ids,
             return_token_type_ids=return_token_type_ids,
@@ -2645,6 +2752,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_position_ids: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2670,6 +2778,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         ],
         padding: Union[bool, str, PaddingStrategy] = True,
         max_length: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         pad_to_multiple_of: Optional[int] = None,
         return_attention_mask: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
@@ -2714,6 +2823,9 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -2727,6 +2839,8 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
+        import paddle
+
         # If we have a list of dicts, let's convert it in a dict of lists
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(
             encoded_inputs[0], (dict, BatchEncoding)
@@ -2780,13 +2894,28 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         required_input = encoded_inputs[self.model_input_names[0]]
         if required_input and not isinstance(required_input[0], (list, tuple)):
-            encoded_inputs = self._pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
+            # some tokenizers might not have the padding_side attribute
+            if "padding_side" in set(inspect.signature(self._pad).parameters.keys()):
+                encoded_inputs = self._pad(
+                    encoded_inputs,
+                    max_length=max_length,
+                    padding_strategy=padding_strategy,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    padding_side=padding_side,
+                    return_attention_mask=return_attention_mask,
+                )
+            else:
+                original_padding_side = self.padding_side
+                self.padding_side = padding_side
+                encoded_inputs = self._pad(
+                    encoded_inputs,
+                    max_length=max_length,
+                    padding_strategy=padding_strategy,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    return_attention_mask=return_attention_mask,
+                )
+                self.padding_side = original_padding_side
+
             return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

         batch_size = len(required_input)
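`pad` probes `self._pad` with `inspect.signature` before forwarding the new keyword, so subclasses that override `_pad` with the old signature keep working; for those, the override is applied by temporarily swapping `self.padding_side`. The same feature-detection idea in isolation, as a self-contained sketch (function names invented):

    import inspect

    def call_with_optional_kwarg(func, *args, _name, _value, **kwargs):
        # Only pass the keyword if `func` actually declares it; legacy
        # implementations simply never see the new argument.
        if _name in inspect.signature(func).parameters:
            kwargs[_name] = _value
        return func(*args, **kwargs)

    def legacy_pad(inputs):  # old-style signature without padding_side
        return inputs

    call_with_optional_kwarg(legacy_pad, [1, 2], _name="padding_side", _value="left")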
@@ -2805,6 +2934,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 inputs,
                 max_length=max_length,
                 padding_strategy=padding_strategy,
+                padding_side=padding_side,
                 pad_to_multiple_of=pad_to_multiple_of,
                 return_attention_mask=return_attention_mask,
             )
@@ -2887,6 +3017,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_position_ids=None,
         return_token_type_ids: Optional[bool] = None,
@@ -2979,7 +3110,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         sequence = ids + pair_ids if pair else ids
         token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

-        # Build output dictionnary
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -3037,6 +3168,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )

@@ -3189,6 +3321,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -3204,13 +3337,16 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-            The tokenizer padding sides are defined in self.padding_side:
+            The tokenizer padding sides are defined in `padding_side` argument:

                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -3244,12 +3380,33 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         if needs_to_be_padded:
             difference = max_length - len(required_input)
+            padding_side = (
+                padding_side if padding_side is not None else self.padding_side
+            )

-            if self.padding_side == "right":
+            if padding_side == "right":
                 if return_attention_mask:
-
-                    encoded_inputs["attention_mask"] = (
-                        encoded_inputs["attention_mask"] + [0] * difference
+                    if len(np.shape(encoded_inputs["attention_mask"])) > 2:
+                        encoded_inputs["attention_mask"] = np.pad(
+                            encoded_inputs["attention_mask"],
+                            pad_width=[(0, 0), (0, difference), (0, difference)],
+                            mode="constant",
+                            constant_values=0,
+                        ).tolist()
+                    else:
+                        encoded_inputs["attention_mask"] = (
+                            encoded_inputs["attention_mask"] + [0] * difference
+                        )
+                if "attn_mask_startend_row_indices" in encoded_inputs:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
+                        [
+                            np.array(
+                                [encoded_inputs["attn_mask_startend_row_indices"]],
+                                dtype=np.int32,
+                            ),
+                            np.zeros([1, difference], dtype=np.int32),
+                        ],
+                        axis=-1,
                     )
                 if "token_type_ids" in encoded_inputs:
                     encoded_inputs["token_type_ids"] = (
@@ -3284,11 +3441,32 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 encoded_inputs[self.model_input_names[0]] = (
                     required_input + [self.pad_token_id] * difference
                 )
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [
-                        0
-                    ] * difference + encoded_inputs["attention_mask"]
+                    if len(np.shape(encoded_inputs["attention_mask"])) > 2:
+                        # attention_mask shape [1,seq_len,seq_len]
+                        encoded_inputs["attention_mask"] = np.pad(
+                            encoded_inputs["attention_mask"],
+                            pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                            mode="constant",
+                            constant_values=0,
+                        ).tolist()
+                    else:
+                        encoded_inputs["attention_mask"] = [
+                            0
+                        ] * difference + encoded_inputs["attention_mask"]
+                if "attn_mask_startend_row_indices" in encoded_inputs:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
+                        [
+                            np.zeros([1, difference], dtype=np.int32),
+                            np.array(
+                                [encoded_inputs["attn_mask_startend_row_indices"]],
+                                dtype=np.int32,
+                            )
+                            + difference,
+                        ],
+                        axis=-1,
+                    )
                 if "token_type_ids" in encoded_inputs:
                     encoded_inputs["token_type_ids"] = [
                         self.pad_token_type_id
@@ -3322,6 +3500,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 ] * difference + required_input
             else:
                 raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+        else:
+            if "attn_mask_startend_row_indices" in encoded_inputs:
+                if len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 1:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.array([encoded_inputs["attn_mask_startend_row_indices"]], dtype=np.int32)  # fmt:skip
+
+        if "attn_mask_startend_row_indices" in encoded_inputs:
+            assert (
+                len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 2
+            )  # [num_head, seq_len]

         return encoded_inputs

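`_pad` now handles both 2-D (per-token) and 3-D (`[1, seq_len, seq_len]`) attention masks, padding the 3-D case on both the row and column axes with `np.pad`, and it keeps the auxiliary `attn_mask_startend_row_indices` aligned (shifting the indices by `difference` when padding on the left). A self-contained sketch of the 3-D case, with invented sizes:

    import numpy as np

    # A [1, 3, 3] causal mask for a 3-token sequence, left-padded to length 5.
    mask = np.tril(np.ones((1, 3, 3), dtype=np.int64))
    difference = 2

    padded = np.pad(
        mask,
        pad_width=[(0, 0), (difference, 0), (difference, 0)],  # rows and cols
        mode="constant",
        constant_values=0,
    )
    assert padded.shape == (1, 5, 5)  # padded positions attend to nothing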
@@ -3338,9 +3525,38 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         """
         raise NotImplementedError

+    def decode_token(
+        self,
+        all_input_ids: List[int],
+        prefix_offset: int = 0,
+        read_offset: int = 0,
+    ) -> Tuple[str, int, int]:
+        """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
+        prefix_text = self.decode(
+            all_input_ids[prefix_offset:read_offset],
+            skip_special_tokens=False,
+            clean_up_tokenization_spaces=False,
+        )
+        new_text = self.decode(
+            all_input_ids[prefix_offset:],
+            skip_special_tokens=False,
+            clean_up_tokenization_spaces=False,
+        )
+
+        if (
+            len(new_text) > len(prefix_text)
+            and not prefix_text.endswith("�")
+            and not new_text.endswith("�")
+        ):
+            prefix_index = new_text.index(prefix_text)
+            new_text = new_text[prefix_index + len(prefix_text) :]
+            return new_text, read_offset, len(all_input_ids)
+        else:
+            return "", prefix_offset, read_offset
+
     def batch_decode(
         self,
-        sequences: Union[List[int], List[List[int]], "np.ndarray", "paddle.Tensor"],
+        sequences,
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = True,
         **kwargs,
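`decode_token` enables incremental detokenization for streaming generation: it re-decodes a window of ids and withholds output while the tail still ends in the Unicode replacement character (an incomplete multi-byte sequence). A hedged driver loop; `generate_step()` is a hypothetical token producer and `tok` any tokenizer exposing `decode_token`:

    all_ids, prefix_offset, read_offset = [], 0, 0
    for token_id in generate_step():
        all_ids.append(token_id)
        text, prefix_offset, read_offset = tok.decode_token(
            all_ids, prefix_offset, read_offset
        )
        if text:  # empty string means the byte sequence is still incomplete
            print(text, end="", flush=True)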
@@ -3373,7 +3589,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

     def decode(
         self,
-        token_ids: Union[int, List[int], "np.ndarray", "paddle.Tensor"],
+        token_ids,
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = True,
         **kwargs,