paddlex 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- paddlex/.version +1 -1
- paddlex/__init__.py +17 -34
- paddlex/__main__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/anomaly_detection.yaml +1 -1
- paddlex/configs/pipelines/doc_understanding.yaml +9 -0
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/configs/pipelines/ts_anomaly_detection.yaml +1 -1
- paddlex/configs/pipelines/ts_classification.yaml +1 -1
- paddlex/configs/pipelines/ts_forecast.yaml +1 -1
- paddlex/constants.py +17 -0
- paddlex/engine.py +7 -5
- paddlex/hpip_links.html +23 -11
- paddlex/inference/__init__.py +3 -3
- paddlex/inference/common/__init__.py +1 -1
- paddlex/inference/common/batch_sampler/__init__.py +5 -4
- paddlex/inference/common/batch_sampler/audio_batch_sampler.py +5 -6
- paddlex/inference/common/batch_sampler/base_batch_sampler.py +20 -16
- paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +4 -7
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +87 -0
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +45 -60
- paddlex/inference/common/batch_sampler/ts_batch_sampler.py +9 -10
- paddlex/inference/common/batch_sampler/video_batch_sampler.py +2 -22
- paddlex/inference/common/reader/__init__.py +4 -4
- paddlex/inference/common/reader/audio_reader.py +3 -3
- paddlex/inference/common/reader/det_3d_reader.py +7 -5
- paddlex/inference/common/reader/image_reader.py +16 -12
- paddlex/inference/common/reader/ts_reader.py +3 -2
- paddlex/inference/common/reader/video_reader.py +3 -3
- paddlex/inference/common/result/__init__.py +7 -7
- paddlex/inference/common/result/base_cv_result.py +12 -2
- paddlex/inference/common/result/base_result.py +7 -5
- paddlex/inference/common/result/base_ts_result.py +1 -2
- paddlex/inference/common/result/base_video_result.py +2 -2
- paddlex/inference/common/result/mixin.py +31 -25
- paddlex/inference/models/__init__.py +41 -85
- paddlex/inference/models/anomaly_detection/__init__.py +1 -1
- paddlex/inference/models/anomaly_detection/predictor.py +9 -19
- paddlex/inference/models/anomaly_detection/processors.py +9 -2
- paddlex/inference/models/anomaly_detection/result.py +3 -2
- paddlex/inference/models/base/__init__.py +2 -2
- paddlex/inference/models/base/predictor/__init__.py +1 -2
- paddlex/inference/models/base/predictor/base_predictor.py +278 -39
- paddlex/inference/models/common/__init__.py +6 -15
- paddlex/inference/models/common/static_infer.py +724 -251
- paddlex/inference/models/common/tokenizer/__init__.py +7 -3
- paddlex/inference/models/common/tokenizer/bert_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +9 -7
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +438 -0
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +85 -77
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +339 -123
- paddlex/inference/models/common/tokenizer/utils.py +1 -1
- paddlex/inference/models/common/tokenizer/vocab.py +8 -8
- paddlex/inference/models/common/ts/__init__.py +1 -1
- paddlex/inference/models/common/ts/funcs.py +13 -6
- paddlex/inference/models/common/ts/processors.py +14 -5
- paddlex/inference/models/common/vision/__init__.py +3 -3
- paddlex/inference/models/common/vision/funcs.py +17 -12
- paddlex/inference/models/common/vision/processors.py +61 -46
- paddlex/inference/models/common/vlm/__init__.py +13 -0
- paddlex/inference/models/common/vlm/activations.py +189 -0
- paddlex/inference/models/common/vlm/bert_padding.py +127 -0
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/distributed.py +229 -0
- paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
- paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
- paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
- paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
- paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
- paddlex/inference/models/common/vlm/transformers/model_utils.py +2014 -0
- paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
- paddlex/inference/models/common/vlm/utils.py +109 -0
- paddlex/inference/models/doc_vlm/__init__.py +15 -0
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +17 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2495 -0
- paddlex/inference/models/doc_vlm/predictor.py +253 -0
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +17 -0
- paddlex/inference/models/doc_vlm/processors/common.py +561 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +543 -0
- paddlex/inference/models/doc_vlm/result.py +21 -0
- paddlex/inference/models/face_feature/__init__.py +1 -1
- paddlex/inference/models/face_feature/predictor.py +2 -1
- paddlex/inference/models/formula_recognition/__init__.py +1 -1
- paddlex/inference/models/formula_recognition/predictor.py +18 -28
- paddlex/inference/models/formula_recognition/processors.py +126 -97
- paddlex/inference/models/formula_recognition/result.py +43 -35
- paddlex/inference/models/image_classification/__init__.py +1 -1
- paddlex/inference/models/image_classification/predictor.py +9 -19
- paddlex/inference/models/image_classification/processors.py +4 -2
- paddlex/inference/models/image_classification/result.py +4 -3
- paddlex/inference/models/image_feature/__init__.py +1 -1
- paddlex/inference/models/image_feature/predictor.py +9 -19
- paddlex/inference/models/image_feature/processors.py +7 -5
- paddlex/inference/models/image_feature/result.py +2 -3
- paddlex/inference/models/image_multilabel_classification/__init__.py +1 -1
- paddlex/inference/models/image_multilabel_classification/predictor.py +7 -6
- paddlex/inference/models/image_multilabel_classification/processors.py +6 -2
- paddlex/inference/models/image_multilabel_classification/result.py +4 -3
- paddlex/inference/models/image_unwarping/__init__.py +1 -1
- paddlex/inference/models/image_unwarping/predictor.py +8 -16
- paddlex/inference/models/image_unwarping/processors.py +6 -2
- paddlex/inference/models/image_unwarping/result.py +4 -2
- paddlex/inference/models/instance_segmentation/__init__.py +1 -1
- paddlex/inference/models/instance_segmentation/predictor.py +7 -15
- paddlex/inference/models/instance_segmentation/processors.py +4 -7
- paddlex/inference/models/instance_segmentation/result.py +11 -10
- paddlex/inference/models/keypoint_detection/__init__.py +1 -1
- paddlex/inference/models/keypoint_detection/predictor.py +5 -3
- paddlex/inference/models/keypoint_detection/processors.py +11 -3
- paddlex/inference/models/keypoint_detection/result.py +9 -4
- paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1 (directory renamed; see the note after this file list)
- paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/predictor.py +15 -26
- paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/processors.py +26 -14
- paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/result.py +15 -12
- paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/visualizer_3d.py +77 -39
- paddlex/inference/models/multilingual_speech_recognition/__init__.py +1 -1
- paddlex/inference/models/multilingual_speech_recognition/predictor.py +11 -15
- paddlex/inference/models/multilingual_speech_recognition/processors.py +45 -53
- paddlex/inference/models/multilingual_speech_recognition/result.py +1 -1
- paddlex/inference/models/object_detection/__init__.py +1 -1
- paddlex/inference/models/object_detection/predictor.py +8 -12
- paddlex/inference/models/object_detection/processors.py +63 -33
- paddlex/inference/models/object_detection/result.py +5 -4
- paddlex/inference/models/object_detection/utils.py +3 -1
- paddlex/inference/models/open_vocabulary_detection/__init__.py +1 -1
- paddlex/inference/models/open_vocabulary_detection/predictor.py +31 -14
- paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +3 -2
- paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
- paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +19 -8
- paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
- paddlex/inference/models/open_vocabulary_segmentation/__init__.py +1 -1
- paddlex/inference/models/open_vocabulary_segmentation/predictor.py +6 -13
- paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +1 -1
- paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +12 -12
- paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +1 -1
- paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +11 -9
- paddlex/inference/models/semantic_segmentation/__init__.py +1 -1
- paddlex/inference/models/semantic_segmentation/predictor.py +9 -18
- paddlex/inference/models/semantic_segmentation/processors.py +11 -8
- paddlex/inference/models/semantic_segmentation/result.py +4 -3
- paddlex/inference/models/table_structure_recognition/__init__.py +1 -1
- paddlex/inference/models/table_structure_recognition/predictor.py +8 -18
- paddlex/inference/models/table_structure_recognition/processors.py +23 -29
- paddlex/inference/models/table_structure_recognition/result.py +8 -15
- paddlex/inference/models/text_detection/__init__.py +1 -1
- paddlex/inference/models/text_detection/predictor.py +24 -24
- paddlex/inference/models/text_detection/processors.py +116 -44
- paddlex/inference/models/text_detection/result.py +8 -13
- paddlex/inference/models/text_recognition/__init__.py +1 -1
- paddlex/inference/models/text_recognition/predictor.py +11 -19
- paddlex/inference/models/text_recognition/processors.py +27 -13
- paddlex/inference/models/text_recognition/result.py +3 -2
- paddlex/inference/models/ts_anomaly_detection/__init__.py +1 -1
- paddlex/inference/models/ts_anomaly_detection/predictor.py +12 -17
- paddlex/inference/models/ts_anomaly_detection/processors.py +6 -2
- paddlex/inference/models/ts_anomaly_detection/result.py +21 -10
- paddlex/inference/models/ts_classification/__init__.py +1 -1
- paddlex/inference/models/ts_classification/predictor.py +14 -27
- paddlex/inference/models/ts_classification/processors.py +7 -2
- paddlex/inference/models/ts_classification/result.py +21 -12
- paddlex/inference/models/ts_forecasting/__init__.py +1 -1
- paddlex/inference/models/ts_forecasting/predictor.py +13 -18
- paddlex/inference/models/ts_forecasting/processors.py +12 -3
- paddlex/inference/models/ts_forecasting/result.py +24 -11
- paddlex/inference/models/video_classification/__init__.py +1 -1
- paddlex/inference/models/video_classification/predictor.py +9 -15
- paddlex/inference/models/video_classification/processors.py +24 -24
- paddlex/inference/models/video_classification/result.py +7 -3
- paddlex/inference/models/video_detection/__init__.py +1 -1
- paddlex/inference/models/video_detection/predictor.py +8 -15
- paddlex/inference/models/video_detection/processors.py +24 -11
- paddlex/inference/models/video_detection/result.py +10 -5
- paddlex/inference/pipelines/__init__.py +48 -37
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/__init__.py +1 -1
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +29 -9
- paddlex/inference/pipelines/attribute_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +24 -9
- paddlex/inference/pipelines/attribute_recognition/result.py +10 -8
- paddlex/inference/pipelines/base.py +43 -13
- paddlex/inference/pipelines/components/__init__.py +14 -8
- paddlex/inference/pipelines/components/chat_server/__init__.py +1 -1
- paddlex/inference/pipelines/components/chat_server/base.py +2 -2
- paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +8 -8
- paddlex/inference/pipelines/components/common/__init__.py +5 -4
- paddlex/inference/pipelines/components/common/base_operator.py +2 -1
- paddlex/inference/pipelines/components/common/base_result.py +3 -2
- paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +1 -2
- paddlex/inference/pipelines/components/common/crop_image_regions.py +11 -5
- paddlex/inference/pipelines/components/common/seal_det_warp.py +44 -13
- paddlex/inference/pipelines/components/common/sort_boxes.py +4 -2
- paddlex/inference/pipelines/components/common/warp_image.py +50 -0
- paddlex/inference/pipelines/components/faisser.py +10 -5
- paddlex/inference/pipelines/components/prompt_engineering/__init__.py +2 -2
- paddlex/inference/pipelines/components/prompt_engineering/base.py +2 -2
- paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +2 -1
- paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +2 -2
- paddlex/inference/pipelines/components/retriever/__init__.py +2 -2
- paddlex/inference/pipelines/components/retriever/base.py +18 -16
- paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +2 -2
- paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +87 -84
- paddlex/inference/pipelines/components/utils/__init__.py +1 -1
- paddlex/inference/pipelines/components/utils/mixin.py +7 -7
- paddlex/inference/pipelines/doc_preprocessor/__init__.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +70 -51
- paddlex/inference/pipelines/doc_preprocessor/result.py +5 -10
- paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
- paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
- paddlex/inference/pipelines/face_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/face_recognition/pipeline.py +3 -1
- paddlex/inference/pipelines/face_recognition/result.py +3 -2
- paddlex/inference/pipelines/formula_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/formula_recognition/pipeline.py +137 -93
- paddlex/inference/pipelines/formula_recognition/result.py +20 -29
- paddlex/inference/pipelines/image_classification/__init__.py +1 -1
- paddlex/inference/pipelines/image_classification/pipeline.py +30 -11
- paddlex/inference/pipelines/image_multilabel_classification/__init__.py +1 -1
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +31 -12
- paddlex/inference/pipelines/instance_segmentation/__init__.py +1 -1
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +30 -9
- paddlex/inference/pipelines/keypoint_detection/__init__.py +1 -1
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +30 -9
- paddlex/inference/pipelines/layout_parsing/__init__.py +1 -1
- paddlex/inference/pipelines/layout_parsing/pipeline.py +54 -56
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +904 -261
- paddlex/inference/pipelines/layout_parsing/result.py +9 -21
- paddlex/inference/pipelines/layout_parsing/result_v2.py +525 -250
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +570 -2004
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1
- paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/pipeline.py +17 -10
- paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +17 -6
- paddlex/inference/pipelines/object_detection/__init__.py +1 -1
- paddlex/inference/pipelines/object_detection/pipeline.py +29 -9
- paddlex/inference/pipelines/ocr/__init__.py +1 -1
- paddlex/inference/pipelines/ocr/pipeline.py +151 -77
- paddlex/inference/pipelines/ocr/result.py +31 -24
- paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +1 -1
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +17 -6
- paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +1 -1
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +17 -6
- paddlex/inference/pipelines/pp_chatocr/__init__.py +1 -1
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +14 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +22 -14
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +34 -16
- paddlex/inference/pipelines/pp_shitu_v2/__init__.py +1 -1
- paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +12 -8
- paddlex/inference/pipelines/pp_shitu_v2/result.py +4 -4
- paddlex/inference/pipelines/rotated_object_detection/__init__.py +1 -1
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +30 -9
- paddlex/inference/pipelines/seal_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/seal_recognition/pipeline.py +127 -63
- paddlex/inference/pipelines/seal_recognition/result.py +4 -2
- paddlex/inference/pipelines/semantic_segmentation/__init__.py +1 -1
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +30 -9
- paddlex/inference/pipelines/small_object_detection/__init__.py +1 -1
- paddlex/inference/pipelines/small_object_detection/pipeline.py +30 -9
- paddlex/inference/pipelines/table_recognition/__init__.py +1 -1
- paddlex/inference/pipelines/table_recognition/pipeline.py +61 -37
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +668 -65
- paddlex/inference/pipelines/table_recognition/result.py +12 -10
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +12 -8
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +55 -37
- paddlex/inference/pipelines/table_recognition/utils.py +1 -1
- paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +1 -1
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ts_classification/__init__.py +1 -1
- paddlex/inference/pipelines/ts_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/ts_forecasting/__init__.py +1 -1
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +16 -6
- paddlex/inference/pipelines/video_classification/__init__.py +1 -1
- paddlex/inference/pipelines/video_classification/pipeline.py +17 -6
- paddlex/inference/pipelines/video_detection/__init__.py +1 -1
- paddlex/inference/pipelines/video_detection/pipeline.py +20 -7
- paddlex/inference/serving/__init__.py +5 -1
- paddlex/inference/serving/basic_serving/__init__.py +1 -1
- paddlex/inference/serving/basic_serving/_app.py +31 -19
- paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +7 -4
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +12 -4
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +7 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
- paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +16 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +13 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +10 -8
- paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +13 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +14 -12
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +17 -14
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +16 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +16 -9
- paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +11 -12
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +14 -12
- paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +10 -7
- paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +10 -7
- paddlex/inference/serving/basic_serving/_server.py +9 -4
- paddlex/inference/serving/infra/__init__.py +1 -1
- paddlex/inference/serving/infra/config.py +1 -1
- paddlex/inference/serving/infra/models.py +13 -6
- paddlex/inference/serving/infra/storage.py +9 -4
- paddlex/inference/serving/infra/utils.py +54 -28
- paddlex/inference/serving/schemas/__init__.py +1 -1
- paddlex/inference/serving/schemas/anomaly_detection.py +1 -1
- paddlex/inference/serving/schemas/doc_preprocessor.py +1 -1
- paddlex/inference/serving/schemas/doc_understanding.py +78 -0
- paddlex/inference/serving/schemas/face_recognition.py +1 -1
- paddlex/inference/serving/schemas/formula_recognition.py +2 -2
- paddlex/inference/serving/schemas/human_keypoint_detection.py +1 -1
- paddlex/inference/serving/schemas/image_classification.py +1 -1
- paddlex/inference/serving/schemas/image_multilabel_classification.py +1 -1
- paddlex/inference/serving/schemas/instance_segmentation.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +2 -3
- paddlex/inference/serving/schemas/m_3d_bev_detection.py +1 -1
- paddlex/inference/serving/schemas/multilingual_speech_recognition.py +1 -1
- paddlex/inference/serving/schemas/object_detection.py +1 -1
- paddlex/inference/serving/schemas/ocr.py +1 -1
- paddlex/inference/serving/schemas/open_vocabulary_detection.py +1 -1
- paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +1 -1
- paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +1 -1
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +2 -3
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +3 -3
- paddlex/inference/serving/schemas/pp_shituv2.py +1 -1
- paddlex/inference/serving/schemas/pp_structurev3.py +11 -7
- paddlex/inference/serving/schemas/rotated_object_detection.py +1 -1
- paddlex/inference/serving/schemas/seal_recognition.py +2 -2
- paddlex/inference/serving/schemas/semantic_segmentation.py +1 -1
- paddlex/inference/serving/schemas/shared/__init__.py +1 -1
- paddlex/inference/serving/schemas/shared/classification.py +1 -1
- paddlex/inference/serving/schemas/shared/image_segmentation.py +1 -1
- paddlex/inference/serving/schemas/shared/object_detection.py +1 -1
- paddlex/inference/serving/schemas/shared/ocr.py +1 -1
- paddlex/inference/serving/schemas/small_object_detection.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +3 -7
- paddlex/inference/serving/schemas/table_recognition_v2.py +6 -7
- paddlex/inference/serving/schemas/ts_anomaly_detection.py +1 -1
- paddlex/inference/serving/schemas/ts_classification.py +1 -1
- paddlex/inference/serving/schemas/ts_forecast.py +1 -1
- paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +1 -1
- paddlex/inference/serving/schemas/video_classification.py +1 -1
- paddlex/inference/serving/schemas/video_detection.py +1 -1
- paddlex/inference/utils/__init__.py +1 -1
- paddlex/inference/utils/benchmark.py +332 -179
- paddlex/inference/utils/color_map.py +1 -1
- paddlex/inference/utils/get_pipeline_path.py +1 -1
- paddlex/inference/utils/hpi.py +258 -0
- paddlex/inference/utils/hpi_model_info_collection.json +2331 -0
- paddlex/inference/utils/io/__init__.py +11 -11
- paddlex/inference/utils/io/readers.py +31 -27
- paddlex/inference/utils/io/style.py +21 -14
- paddlex/inference/utils/io/tablepyxl.py +13 -5
- paddlex/inference/utils/io/writers.py +9 -10
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/model_paths.py +48 -0
- paddlex/inference/utils/{new_ir_blacklist.py → new_ir_blocklist.py} +1 -2
- paddlex/inference/utils/official_models.py +278 -262
- paddlex/inference/utils/pp_option.py +184 -92
- paddlex/inference/utils/trt_blocklist.py +43 -0
- paddlex/inference/utils/trt_config.py +420 -0
- paddlex/model.py +30 -12
- paddlex/modules/__init__.py +57 -80
- paddlex/modules/anomaly_detection/__init__.py +2 -2
- paddlex/modules/anomaly_detection/dataset_checker/__init__.py +2 -3
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +6 -3
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +8 -4
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +7 -4
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +2 -2
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +7 -2
- paddlex/modules/anomaly_detection/evaluator.py +3 -3
- paddlex/modules/anomaly_detection/exportor.py +1 -1
- paddlex/modules/anomaly_detection/model_list.py +1 -1
- paddlex/modules/anomaly_detection/trainer.py +3 -4
- paddlex/modules/base/__init__.py +5 -5
- paddlex/modules/base/build_model.py +1 -2
- paddlex/modules/base/dataset_checker/__init__.py +2 -2
- paddlex/modules/base/dataset_checker/dataset_checker.py +4 -4
- paddlex/modules/base/dataset_checker/utils.py +1 -3
- paddlex/modules/base/evaluator.py +13 -13
- paddlex/modules/base/exportor.py +12 -13
- paddlex/modules/base/trainer.py +21 -11
- paddlex/modules/base/utils/__init__.py +13 -0
- paddlex/modules/base/utils/cinn_setting.py +89 -0
- paddlex/modules/base/utils/coco_eval.py +94 -0
- paddlex/modules/base/utils/topk_eval.py +118 -0
- paddlex/modules/doc_vlm/__init__.py +18 -0
- paddlex/modules/doc_vlm/dataset_checker.py +29 -0
- paddlex/modules/doc_vlm/evaluator.py +29 -0
- paddlex/modules/doc_vlm/exportor.py +29 -0
- paddlex/modules/doc_vlm/model_list.py +16 -0
- paddlex/modules/doc_vlm/trainer.py +41 -0
- paddlex/modules/face_recognition/__init__.py +2 -2
- paddlex/modules/face_recognition/dataset_checker/__init__.py +2 -2
- paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +1 -1
- paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +3 -5
- paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
- paddlex/modules/face_recognition/evaluator.py +3 -3
- paddlex/modules/face_recognition/exportor.py +1 -1
- paddlex/modules/face_recognition/model_list.py +1 -1
- paddlex/modules/face_recognition/trainer.py +1 -1
- paddlex/modules/formula_recognition/__init__.py +2 -2
- paddlex/modules/formula_recognition/dataset_checker/__init__.py +3 -3
- paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
- paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +2 -6
- paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
- paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
- paddlex/modules/formula_recognition/evaluator.py +6 -3
- paddlex/modules/formula_recognition/exportor.py +1 -1
- paddlex/modules/formula_recognition/model_list.py +4 -1
- paddlex/modules/formula_recognition/trainer.py +5 -3
- paddlex/modules/general_recognition/__init__.py +2 -2
- paddlex/modules/general_recognition/dataset_checker/__init__.py +2 -2
- paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +7 -9
- paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +4 -5
- paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +6 -5
- paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
- paddlex/modules/general_recognition/evaluator.py +2 -2
- paddlex/modules/general_recognition/exportor.py +1 -1
- paddlex/modules/general_recognition/model_list.py +1 -1
- paddlex/modules/general_recognition/trainer.py +1 -1
- paddlex/modules/image_classification/__init__.py +2 -2
- paddlex/modules/image_classification/dataset_checker/__init__.py +2 -2
- paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
- paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
- paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +2 -5
- paddlex/modules/image_classification/evaluator.py +3 -3
- paddlex/modules/image_classification/exportor.py +1 -1
- paddlex/modules/image_classification/model_list.py +2 -1
- paddlex/modules/image_classification/trainer.py +3 -3
- paddlex/modules/image_unwarping/__init__.py +1 -1
- paddlex/modules/image_unwarping/model_list.py +1 -1
- paddlex/modules/instance_segmentation/__init__.py +2 -2
- paddlex/modules/instance_segmentation/dataset_checker/__init__.py +2 -3
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +9 -5
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +8 -5
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +8 -8
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +7 -4
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +10 -8
- paddlex/modules/instance_segmentation/evaluator.py +2 -2
- paddlex/modules/instance_segmentation/exportor.py +1 -1
- paddlex/modules/instance_segmentation/model_list.py +1 -1
- paddlex/modules/instance_segmentation/trainer.py +1 -1
- paddlex/modules/keypoint_detection/__init__.py +2 -2
- paddlex/modules/keypoint_detection/dataset_checker/__init__.py +2 -2
- paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +1 -1
- paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
- paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +8 -3
- paddlex/modules/keypoint_detection/evaluator.py +2 -2
- paddlex/modules/keypoint_detection/exportor.py +1 -1
- paddlex/modules/keypoint_detection/model_list.py +1 -1
- paddlex/modules/keypoint_detection/trainer.py +2 -2
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/__init__.py +2 -2
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/__init__.py +3 -3
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/analyse_dataset.py +8 -8
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/check_dataset.py +1 -2
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/evaluator.py +3 -3
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/exportor.py +1 -1
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/model_list.py +1 -1
- paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/trainer.py +5 -7
- paddlex/modules/multilabel_classification/__init__.py +2 -2
- paddlex/modules/multilabel_classification/dataset_checker/__init__.py +2 -2
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +10 -7
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +1 -5
- paddlex/modules/multilabel_classification/evaluator.py +3 -3
- paddlex/modules/multilabel_classification/exportor.py +1 -1
- paddlex/modules/multilabel_classification/model_list.py +1 -1
- paddlex/modules/multilabel_classification/trainer.py +3 -3
- paddlex/modules/multilingual_speech_recognition/__init__.py +2 -2
- paddlex/modules/multilingual_speech_recognition/dataset_checker.py +3 -3
- paddlex/modules/multilingual_speech_recognition/evaluator.py +3 -3
- paddlex/modules/multilingual_speech_recognition/exportor.py +3 -3
- paddlex/modules/multilingual_speech_recognition/model_list.py +1 -1
- paddlex/modules/multilingual_speech_recognition/trainer.py +7 -5
- paddlex/modules/object_detection/__init__.py +2 -2
- paddlex/modules/object_detection/dataset_checker/__init__.py +2 -11
- paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +10 -8
- paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +17 -12
- paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +8 -4
- paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +9 -8
- paddlex/modules/object_detection/evaluator.py +11 -6
- paddlex/modules/object_detection/exportor.py +1 -1
- paddlex/modules/object_detection/model_list.py +3 -1
- paddlex/modules/object_detection/trainer.py +4 -5
- paddlex/modules/open_vocabulary_detection/__init__.py +2 -2
- paddlex/modules/open_vocabulary_detection/dataset_checker.py +3 -3
- paddlex/modules/open_vocabulary_detection/evaluator.py +3 -3
- paddlex/modules/open_vocabulary_detection/exportor.py +3 -3
- paddlex/modules/open_vocabulary_detection/model_list.py +2 -4
- paddlex/modules/open_vocabulary_detection/trainer.py +7 -5
- paddlex/modules/open_vocabulary_segmentation/__init__.py +2 -2
- paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +3 -3
- paddlex/modules/open_vocabulary_segmentation/evaluator.py +3 -3
- paddlex/modules/open_vocabulary_segmentation/exportor.py +3 -3
- paddlex/modules/open_vocabulary_segmentation/model_list.py +1 -1
- paddlex/modules/open_vocabulary_segmentation/trainer.py +7 -5
- paddlex/modules/semantic_segmentation/__init__.py +2 -2
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +2 -3
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +6 -3
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +2 -2
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +7 -4
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +2 -2
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
- paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +6 -2
- paddlex/modules/semantic_segmentation/evaluator.py +3 -3
- paddlex/modules/semantic_segmentation/exportor.py +1 -1
- paddlex/modules/semantic_segmentation/model_list.py +1 -1
- paddlex/modules/semantic_segmentation/trainer.py +3 -4
- paddlex/modules/table_recognition/__init__.py +2 -2
- paddlex/modules/table_recognition/dataset_checker/__init__.py +5 -5
- paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +3 -2
- paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +8 -7
- paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +2 -1
- paddlex/modules/table_recognition/evaluator.py +3 -3
- paddlex/modules/table_recognition/exportor.py +1 -1
- paddlex/modules/table_recognition/model_list.py +1 -1
- paddlex/modules/table_recognition/trainer.py +2 -5
- paddlex/modules/text_detection/__init__.py +2 -2
- paddlex/modules/text_detection/dataset_checker/__init__.py +4 -6
- paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +12 -9
- paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +3 -3
- paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +3 -3
- paddlex/modules/text_detection/evaluator.py +3 -3
- paddlex/modules/text_detection/exportor.py +1 -1
- paddlex/modules/text_detection/model_list.py +3 -1
- paddlex/modules/text_detection/trainer.py +2 -5
- paddlex/modules/text_recognition/__init__.py +2 -2
- paddlex/modules/text_recognition/dataset_checker/__init__.py +4 -5
- paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
- paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +2 -5
- paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
- paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
- paddlex/modules/text_recognition/evaluator.py +3 -3
- paddlex/modules/text_recognition/exportor.py +1 -1
- paddlex/modules/text_recognition/model_list.py +3 -1
- paddlex/modules/text_recognition/trainer.py +2 -3
- paddlex/modules/ts_anomaly_detection/__init__.py +2 -2
- paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +4 -5
- paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +1 -9
- paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +2 -2
- paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +2 -6
- paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +4 -4
- paddlex/modules/ts_anomaly_detection/evaluator.py +3 -3
- paddlex/modules/ts_anomaly_detection/exportor.py +2 -3
- paddlex/modules/ts_anomaly_detection/model_list.py +1 -1
- paddlex/modules/ts_anomaly_detection/trainer.py +8 -8
- paddlex/modules/ts_classification/__init__.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/__init__.py +4 -5
- paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -5
- paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +2 -6
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +5 -5
- paddlex/modules/ts_classification/evaluator.py +3 -3
- paddlex/modules/ts_classification/exportor.py +2 -3
- paddlex/modules/ts_classification/model_list.py +1 -1
- paddlex/modules/ts_classification/trainer.py +7 -7
- paddlex/modules/ts_forecast/__init__.py +2 -2
- paddlex/modules/ts_forecast/dataset_checker/__init__.py +4 -5
- paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +1 -9
- paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +2 -2
- paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +2 -6
- paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +4 -4
- paddlex/modules/ts_forecast/evaluator.py +3 -3
- paddlex/modules/ts_forecast/exportor.py +2 -3
- paddlex/modules/ts_forecast/model_list.py +1 -1
- paddlex/modules/ts_forecast/trainer.py +7 -7
- paddlex/modules/video_classification/__init__.py +2 -2
- paddlex/modules/video_classification/dataset_checker/__init__.py +2 -2
- paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +9 -9
- paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +2 -3
- paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/video_classification/evaluator.py +3 -3
- paddlex/modules/video_classification/exportor.py +1 -1
- paddlex/modules/video_classification/model_list.py +1 -1
- paddlex/modules/video_classification/trainer.py +3 -3
- paddlex/modules/video_detection/__init__.py +2 -2
- paddlex/modules/video_detection/dataset_checker/__init__.py +2 -2
- paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +2 -2
- paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +8 -9
- paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +3 -5
- paddlex/modules/video_detection/evaluator.py +3 -3
- paddlex/modules/video_detection/exportor.py +1 -1
- paddlex/modules/video_detection/model_list.py +1 -1
- paddlex/modules/video_detection/trainer.py +3 -3
- paddlex/ops/__init__.py +7 -4
- paddlex/ops/iou3d_nms/iou3d_cpu.cpp +8 -6
- paddlex/ops/iou3d_nms/iou3d_cpu.h +3 -2
- paddlex/ops/iou3d_nms/iou3d_nms.cpp +8 -6
- paddlex/ops/iou3d_nms/iou3d_nms.h +6 -4
- paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +24 -18
- paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +9 -7
- paddlex/ops/setup.py +3 -3
- paddlex/ops/voxel/voxelize_op.cc +22 -19
- paddlex/ops/voxel/voxelize_op.cu +25 -25
- paddlex/paddlex_cli.py +104 -87
- paddlex/repo_apis/Paddle3D_api/__init__.py +1 -1
- paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +1 -1
- paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +1 -1
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +6 -6
- paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +2 -2
- paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +1 -1
- paddlex/repo_apis/Paddle3D_api/pp3d_config.py +3 -2
- paddlex/repo_apis/PaddleClas_api/__init__.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/__init__.py +3 -3
- paddlex/repo_apis/PaddleClas_api/cls/config.py +5 -4
- paddlex/repo_apis/PaddleClas_api/cls/model.py +4 -4
- paddlex/repo_apis/PaddleClas_api/cls/register.py +12 -3
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +2 -3
- paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +2 -2
- paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +2 -2
- paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +1 -4
- paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +2 -2
- paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +1 -6
- paddlex/repo_apis/PaddleDetection_api/__init__.py +2 -2
- paddlex/repo_apis/PaddleDetection_api/config_helper.py +3 -3
- paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +2 -2
- paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +2 -3
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +4 -4
- paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +2 -3
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +2 -3
- paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +3 -3
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +5 -4
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +6 -7
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +26 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +32 -3
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +2 -3
- paddlex/repo_apis/PaddleNLP_api/__init__.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/__init__.py +4 -3
- paddlex/repo_apis/PaddleOCR_api/config_utils.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +7 -6
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +9 -13
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +29 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +2 -3
- paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +4 -4
- paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +2 -3
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/config.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +4 -4
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +20 -3
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +7 -6
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +9 -13
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +20 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +2 -3
- paddlex/repo_apis/PaddleSeg_api/__init__.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +2 -2
- paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/config.py +3 -6
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +6 -6
- paddlex/repo_apis/PaddleSeg_api/seg/register.py +2 -3
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +2 -3
- paddlex/repo_apis/PaddleTS_api/__init__.py +4 -3
- paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +5 -6
- paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_base/config.py +2 -4
- paddlex/repo_apis/PaddleTS_api/ts_base/model.py +4 -4
- paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +4 -5
- paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +6 -7
- paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/__init__.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/config_utils.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +3 -3
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +5 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +2 -3
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +2 -3
- paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +3 -3
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +5 -4
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +5 -5
- paddlex/repo_apis/PaddleVideo_api/video_det/register.py +2 -3
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +2 -3
- paddlex/repo_apis/__init__.py +1 -1
- paddlex/repo_apis/base/__init__.py +4 -5
- paddlex/repo_apis/base/config.py +3 -4
- paddlex/repo_apis/base/model.py +11 -19
- paddlex/repo_apis/base/register.py +1 -1
- paddlex/repo_apis/base/runner.py +11 -12
- paddlex/repo_apis/base/utils/__init__.py +1 -1
- paddlex/repo_apis/base/utils/arg.py +1 -1
- paddlex/repo_apis/base/utils/subprocess.py +1 -1
- paddlex/repo_manager/__init__.py +2 -9
- paddlex/repo_manager/core.py +12 -30
- paddlex/repo_manager/meta.py +41 -31
- paddlex/repo_manager/repo.py +171 -161
- paddlex/repo_manager/utils.py +13 -224
- paddlex/utils/__init__.py +1 -1
- paddlex/utils/cache.py +8 -10
- paddlex/utils/config.py +6 -5
- paddlex/utils/{custom_device_whitelist.py → custom_device_list.py} +53 -199
- paddlex/utils/deps.py +249 -0
- paddlex/utils/device.py +87 -36
- paddlex/utils/download.py +4 -4
- paddlex/utils/env.py +37 -7
- paddlex/utils/errors/__init__.py +1 -1
- paddlex/utils/errors/dataset_checker.py +1 -1
- paddlex/utils/errors/others.py +2 -16
- paddlex/utils/file_interface.py +4 -5
- paddlex/utils/flags.py +17 -12
- paddlex/utils/fonts/__init__.py +36 -5
- paddlex/utils/func_register.py +1 -1
- paddlex/utils/install.py +87 -0
- paddlex/utils/interactive_get_pipeline.py +3 -3
- paddlex/utils/lazy_loader.py +3 -3
- paddlex/utils/logging.py +10 -1
- paddlex/utils/misc.py +6 -6
- paddlex/utils/pipeline_arguments.py +15 -7
- paddlex/utils/result_saver.py +4 -5
- paddlex/utils/subclass_register.py +2 -4
- paddlex/version.py +2 -1
- {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/METADATA +237 -102
- paddlex-3.0.1.dist-info/RECORD +1095 -0
- {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- paddlex/inference/models/base/predictor/basic_predictor.py +0 -139
- paddlex/paddle2onnx_requirements.txt +0 -1
- paddlex/repo_manager/requirements.txt +0 -21
- paddlex/serving_requirements.txt +0 -9
- paddlex-3.0.0rc0.dist-info/RECORD +0 -1015
- {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info/licenses}/LICENSE +0 -0
- {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
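A note on the `3d_bev_detection → m_3d_bev_detection` renames flagged above: the diff itself gives no rationale, but a plausible one (an assumption, not stated anywhere on this page) is that a module name starting with a digit is not a valid Python identifier, so the old directories could never be imported with a plain `import` statement:

```python
# A leading digit makes a module name invalid as a Python identifier:
print("3d_bev_detection".isidentifier())    # False
print("m_3d_bev_detection".isidentifier())  # True

# The old package was therefore only reachable dynamically, e.g.
#   importlib.import_module("paddlex.inference.models.3d_bev_detection")
# whereas the renamed package supports an ordinary import:
#   from paddlex.inference.models import m_3d_bev_detection
```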
Diff body for `paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py` (excerpt):

```diff
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
@@ -1,4 +1,4 @@
-#
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,18 +13,27 @@
 # limitations under the License.
 
 import copy
+import inspect
 import io
 import json
 import os
-
 import warnings
-from collections import
+from collections import UserDict
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 
 import numpy as np
-import lazy_paddle as paddle
 
 from .....utils import logging
 
@@ -44,7 +53,6 @@ __all__ = [
 
 TOKENIZER_CONFIG_NAME = "tokenizer_config.json"
 CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"
-CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"
 
 VERY_LARGE_INTEGER = int(
     1e30
@@ -92,8 +100,6 @@ class AddedToken:
 class FastEncoding:
     """This is dummy class reserved for fast tokenizer"""
 
-    pass
-
 
 class ExplicitEnum(Enum):
     """
@@ -132,6 +138,8 @@ def to_py_obj(obj):
     """
     Convert a Paddle tensor, Numpy array or python list to a python list.
    """
+    import paddle
+
     if isinstance(obj, (dict, UserDict)):
         return {k: to_py_obj(v) for k, v in obj.items()}
     elif isinstance(obj, (list, tuple)):
@@ -289,10 +297,6 @@ class BatchEncoding(UserDict):
     def items(self):
         return self.data.items()
 
-    # After this point:
-    # Extended properties and methods only available for fast tokenizers
-    # not yet supported
-
     @property
     def encodings(self) -> Optional[List[FastEncoding]]:
         """
@@ -722,6 +726,8 @@ class BatchEncoding(UserDict):
             prepend_batch_axis (`int`, *optional*, defaults to `False`):
                 Whether or not to add the batch dimension during the conversion.
         """
+        import paddle
+
         if tensor_type is None:
             return self
 
```
@@ -850,15 +856,17 @@ class SpecialTokensMixin:
         return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

     def add_special_tokens(
-        self,
+        self,
+        special_tokens_dict: Dict[str, Union[str, AddedToken]],
+        replace_additional_special_tokens=True,
     ) -> int:
         """
         Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
         special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
         current vocabulary).

-
-
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
+        model so that its embedding matrix matches the tokenizer.

         In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

@@ -879,6 +887,13 @@ class SpecialTokensMixin:

                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
+            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
+                If `True`, the existing list of additional special tokens will be replaced by the list provided in
+                `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former
+                case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
+                as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
+                `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
+                `additional_special_tokens` are still added tokens, and will not be split by the model.

         Returns:
             `int`: Number of tokens added to the vocabulary.
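A hedged usage sketch for the signature documented above; `tokenizer` and `model` are placeholders for any compatible tokenizer/model pair, and the token strings are invented:

```python
num_added = tokenizer.add_special_tokens(
    {"pad_token": "<pad>", "additional_special_tokens": ["<extra_0>", "<extra_1>"]},
    replace_additional_special_tokens=True,  # overwrite the previous extras list
)
if num_added > 0:
    # Per the docstring, grow the embedding matrix to cover the new ids.
    model.resize_token_embeddings(len(tokenizer))
```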
@@ -902,7 +917,7 @@ class SpecialTokensMixin:
         if not special_tokens_dict:
             return 0

-        added_tokens =
+        added_tokens = []
         for key, value in special_tokens_dict.items():
             assert (
                 key in self.SPECIAL_TOKENS_ATTRIBUTES
@@ -910,19 +925,37 @@ class SpecialTokensMixin:

             if self.verbose:
                 logging.info(f"Assigning {value} to the {key} key of the tokenizer")
-            setattr(self, key, value)

             if key == "additional_special_tokens":
                 assert isinstance(value, (list, tuple)) and all(
                     isinstance(t, (str, AddedToken)) for t in value
                 ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
-
+
+                to_add = []
+                for token in value:
+                    if (
+                        not replace_additional_special_tokens
+                        and str(token) in self.additional_special_tokens
+                    ):
+                        continue
+                    to_add.append(token)
+                if replace_additional_special_tokens and len(to_add) > 0:
+                    setattr(self, key, list(to_add))
+                else:
+                    self._additional_special_tokens.extend(to_add)
+                added_tokens += to_add
+
             else:
-
-
-
-
+                if not isinstance(value, (str, AddedToken)):
+                    raise ValueError(
+                        f"Token {value} for key {key} should be a str or an AddedToken instance"
+                    )
+                setattr(self, key, value)
+                if value not in added_tokens:
+                    added_tokens.append(value)

+        # if we are adding tokens that were not part of the vocab, we ought to add them
+        added_tokens = self.add_tokens(added_tokens, special_tokens=True)
         return added_tokens

     def add_tokens(
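To make the branch above concrete, the two modes differ only in what happens to the extras list; a hedged illustration (token strings invented, tokenizer state assumed):

```python
# Assume tokenizer.additional_special_tokens == ["<a>"] beforehand.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<b>"]},
    replace_additional_special_tokens=True,
)
# Now ["<b>"]; "<a>" remains an added token in the vocab, just no longer flagged special.

tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<c>"]},
    replace_additional_special_tokens=False,
)
# Extend mode: the list becomes ["<b>", "<c>"].
```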
@@ -972,6 +1005,11 @@ class SpecialTokensMixin:

         return self._add_tokens(new_tokens, special_tokens=special_tokens)

+    @classmethod
+    def _add_extra_special_tokens(cls, extra_sp_token: Union[str, AddedToken]):
+        if extra_sp_token not in cls.SPECIAL_TOKENS_ATTRIBUTES:
+            cls.SPECIAL_TOKENS_ATTRIBUTES.append(extra_sp_token)
+
     def _add_tokens(
         self,
         new_tokens: Union[List[str], List[AddedToken]],
@@ -1238,7 +1276,13 @@ class SpecialTokensMixin:
         """
         set_attr = {}
         for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-
+            try:
+                attr_value = getattr(self, "_" + attr)
+            except:
+                try:
+                    attr_value = getattr(self, attr)
+                except:
+                    continue
             if attr_value:
                 set_attr[attr] = (
                     type(attr_value)(
@@ -1262,7 +1306,13 @@ class SpecialTokensMixin:
         """
         set_attr = {}
         for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-
+            try:
+                attr_value = getattr(self, "_" + attr)
+            except:
+                try:
+                    attr_value = getattr(self, attr)
+                except:
+                    continue
             if attr_value:
                 set_attr[attr] = attr_value
         return set_attr
@@ -1286,16 +1336,16 @@ class SpecialTokensMixin:
         Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how
         special tokens are tokenized.
         """
-
-
-        for
-
-
-
-            else [
-        )
-
-        return
+        all_tokens = []
+        seen = set()
+        for value in self.special_tokens_map_extended.values():
+            if isinstance(value, (list, tuple)):
+                tokens_to_add = [token for token in value if str(token) not in seen]
+            else:
+                tokens_to_add = [value] if str(value) not in seen else []
+            seen.update(map(str, tokens_to_add))
+            all_tokens.extend(tokens_to_add)
+        return all_tokens

     @property
     def all_special_ids(self) -> List[int]:
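The rewritten property deduplicates by each token's string form, so a plain str and an equal `AddedToken` collapse into one entry. The same pattern as a standalone sketch with invented values:

```python
special_map = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "additional_special_tokens": ["<s>", "<extra>"],  # "<s>" repeats
}

all_tokens, seen = [], set()
for value in special_map.values():
    candidates = value if isinstance(value, (list, tuple)) else [value]
    fresh = [t for t in candidates if str(t) not in seen]
    seen.update(map(str, fresh))
    all_tokens.extend(fresh)

print(all_tokens)  # ['<s>', '</s>', '<extra>'] - first occurrence wins
```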
@@ -1419,6 +1469,12 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

+        self.clean_up_tokenization_spaces = kwargs.pop(
+            "clean_up_tokenization_spaces", False
+        )
+
+        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
+
         self.deprecation_warnings = (
             {}
         )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
@@ -1466,7 +1522,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

     @max_len_sentences_pair.setter
     def max_len_sentences_pair(self, value) -> int:
-        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
         if (
             value == self.model_max_length - self.num_special_tokens_to_add(pair=True)
             and self.verbose
@@ -1488,10 +1543,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         self._processor_class = processor_class

     def __repr__(self) -> str:
+        added_tokens_decoder_rep = "\n\t".join(
+            [f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]
+        )
         return (
-            f"{
-            f"vocab_size={self.vocab_size},
-            f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',
+            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
+            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
+            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
+            f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
+            " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
         )

     def get_vocab(self) -> Dict[str, int]:
@@ -1547,17 +1607,13 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 # Load from local directory path
                 tokenizer = BertTokenizer.from_pretrained('./my_bert/')
         """
-
-        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         cache_dir = kwargs.pop("cache_dir", None)
         from_hf_hub = kwargs.pop("from_hf_hub", False)
         from_aistudio = kwargs.pop("from_aistudio", False)
         subfolder = kwargs.pop("subfolder", "")
         return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False)

-
-        subfolder = ""
-
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         vocab_files = {}
         init_configuration = {}

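The kwargs popped above are the caller-facing knobs of `from_pretrained`; a hedged example reusing the docstring's local-directory path (`BertTokenizer` and `./my_bert/` come from the docstring; the tuple return shape is an assumption tied to `return_tokenizer_file_dir=True`):

```python
# Assumption: with return_tokenizer_file_dir=True the call returns
# (tokenizer, directory_the_resolved_files_were_read_from).
tokenizer, file_dir = BertTokenizer.from_pretrained(
    "./my_bert/",
    return_tokenizer_file_dir=True,
)
```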
@@ -1568,12 +1624,17 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME,
         }

+        if hasattr(cls, "vocab_files_names") and len(cls.resource_files_names) == 0:
+            cls.resource_files_names = copy.deepcopy(cls.vocab_files_names)
+            logging.error(
+                "The attribute 'vocab_files_names' is deprecated. Please use 'resource_files_names' instead.",
+                DeprecationWarning,
+            )
         vocab_files_target = {**cls.resource_files_names, **additional_files_names}
-
         # From HF Hub or AI Studio
         if from_hf_hub or from_aistudio:
             # Only include the necessary resource files specified by the tokenizer cls
-            # Deep copy to avoid
+            # Deep copy to avoid modifying the class attributes
             vocab_files = copy.deepcopy(cls.resource_files_names)
             vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file

@@ -1597,29 +1658,58 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         # Assuming from community-contributed pretrained models
         for file_id, file_name in vocab_files_target.items():
             vocab_files[file_id] = file_name
-
         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
-
-
-                continue
-            else:
-                logging.warnings("need to download tokenizer, but not support yet.")
-                # tokenizer download not support yet
-                # resolved_vocab_files[file_id] = resolve_file_path(
-                #     pretrained_model_name_or_path,
-                #     [file_path],
-                #     subfolder,
-                #     cache_dir=cache_dir,
-                #     from_aistudio=from_aistudio,
-                #     from_hf_hub=from_hf_hub,
-                # )
+            # adapt to PaddleX
+            resolved_vocab_files[file_id] = file_path

         for file_id, file_path in resolved_vocab_files.items():
             if resolved_vocab_files[file_id] is not None:
                 cache_dir = os.path.dirname(resolved_vocab_files[file_id])
                 break
+        return cls._from_pretrained(
+            resolved_vocab_files,
+            pretrained_model_name_or_path,
+            init_configuration,
+            *args,
+            cache_dir=cache_dir,
+            return_tokenizer_file_dir=return_tokenizer_file_dir,
+            from_hf_hub=from_hf_hub,
+            **kwargs,
+        )

+    @classmethod
+    def _from_pretrained(
+        cls,
+        resolved_vocab_files,
+        pretrained_model_name_or_path,
+        init_configuration,
+        *init_inputs,
+        cache_dir=None,
+        return_tokenizer_file_dir=False,
+        from_hf_hub=False,
+        **kwargs,
+    ):
+        if cls.__name__.endswith("Fast"):
+            from_slow = kwargs.get("from_slow", False)
+        else:
+            from_slow = kwargs.get("from_slow", True)
+        has_tokenizer_file = (
+            resolved_vocab_files.get("tokenizer_file", None) is not None
+        )
+        if (
+            from_slow or not has_tokenizer_file
+        ) and cls.slow_tokenizer_class is not None:
+            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
+                copy.deepcopy(resolved_vocab_files),
+                pretrained_model_name_or_path,
+                copy.deepcopy(init_configuration),
+                *init_inputs,
+                cache_dir=cache_dir,
+                **(copy.deepcopy(kwargs)),
+            )
+        else:
+            slow_tokenizer = None
         tokenizer_config_file_dir_list = set()
         for k, v in resolved_vocab_files.items():
             if v is not None and os.path.isfile(v):
@@ -1629,8 +1719,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         assert (
             len(tokenizer_config_file_dir_list) > 0
         ), "All tokenizer files should be in the same directory."
-
-        # Did we saved some inputs and kwargs to reload ?
+
         has_tokenizer_file = (
             resolved_vocab_files.get("tokenizer_file", None) is not None
         )
@@ -1638,15 +1727,34 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         if tokenizer_config_file is not None:
             with io.open(tokenizer_config_file, encoding="utf-8") as f:
                 init_kwargs = json.load(f)
+            init_kwargs.pop("tokenizer_class", None)
         else:
             init_kwargs = init_configuration

-
-
+        if slow_tokenizer is not None:
+            init_kwargs["__slow_tokenizer"] = slow_tokenizer
+        init_kwargs["name_or_path"] = pretrained_model_name_or_path
+        init_kwargs["from_slow"] = from_slow
+
+        pass_added_tokens_file = False
+        added_tokens_decoder: Dict[int, AddedToken] = {}
+        if "added_tokens_decoder" in init_kwargs:
+            for idx, token in init_kwargs["added_tokens_decoder"].items():
+                if isinstance(token, dict):
+                    token = AddedToken(**token)
+                if isinstance(token, AddedToken):
+                    added_tokens_decoder[int(idx)] = token
+                else:
+                    raise ValueError(
+                        f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
+                    )
+            init_kwargs["added_tokens_decoder"] = (
+                added_tokens_decoder  # NOTE: in tokenizer_config.json, the registered `added_tokens_decoder` is parsed as a dict
+            )
+            pass_added_tokens_file = True
+
         init_kwargs.pop("init_class", None)

-        # Update with newly provided args and kwargs
-        init_args = init_args if not args else args
         init_kwargs.update(kwargs)

         def convert_added_tokens(obj):
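The loop above expects `added_tokens_decoder` in tokenizer_config.json to map string token ids to AddedToken fields; an illustrative fragment (id and content invented; field names follow the Hugging Face convention and may be a subset of what this `AddedToken` accepts):

```python
init_kwargs = {
    "added_tokens_decoder": {
        "151643": {  # token id, serialized as a string key
            "content": "<|endoftext|>",
            "special": True,
        }
    }
}
# Each value is rebuilt as AddedToken(**fields) and re-keyed by int(id).
```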
@@ -1664,10 +1772,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             return obj

         init_kwargs = convert_added_tokens(init_kwargs)
-        # Set max length if needed
         if pretrained_model_name_or_path in cls.max_model_input_sizes:
-            # if we're using a pretrained model, ensure the tokenizer
-            # wont index sequences longer than the number of positional embeddings
             model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
             if model_max_length is not None and isinstance(
                 model_max_length, (int, float)
@@ -1676,32 +1781,28 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                     init_kwargs.get("model_max_length", int(1e30)), model_max_length
                 )

-        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
-        # Merge resolved_vocab_files arguments in init_kwargs if not including.
-        # Maybe need more ways to load resources.
         for args_name, file_path in resolved_vocab_files.items():
-
-            # use pretrained_init_configuration as `init_kwargs` to init which
-            # does not include the vocab file in it, thus add vocab file into
-            # args.
-            if args_name not in init_kwargs:
+            if args_name not in init_kwargs or init_kwargs[args_name] is None:
                 init_kwargs[args_name] = file_path
-            # when `pretrained_model_name_or_path` is a pretrained model dir,
-            # use tokenizer_config_file.json as `init_kwargs` to init which
-            # does include a vocab file path in it. However, if the vocab file
-            # path included in json does not exist, such as was deleted, to make
-            # it still work, use the vocab file under this dir.
             elif not os.path.isfile(init_kwargs[args_name] or "") and os.path.isfile(
                 file_path
             ):
                 init_kwargs[args_name] = file_path

-        # TODO(zhoushunjie): It's not supportted to load tokenizer.json of hf so far.
         if from_hf_hub and "tokenizer_file" in init_kwargs:
             init_kwargs.pop("tokenizer_file")

-
-
+        try:
+            tokenizer = cls(*init_inputs, **init_kwargs)
+        # adapt to PaddleX
+        except RuntimeError as e:
+            if "sentencepiece_processor.cc" in str(e):
+                logging.info(
+                    "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
+                    "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).",
+                )
+                return False
+
         chat_template = init_kwargs.pop("chat_template", None)
         if chat_template is not None:
             tokenizer.init_chat_template(chat_template)
@@ -1715,11 +1816,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 special_tokens_map = json.load(special_tokens_map_handle)
             for key, value in special_tokens_map.items():
                 if key in kwargs and kwargs[key]:
-                    # This value has already been redefined by the kwargs
-                    # We keep this new value and ignore the one stored in the special_tokens_map_file
-
                     continue
-
                 if isinstance(value, dict):
                     value = AddedToken(**value)
                 elif isinstance(value, list):
@@ -1728,13 +1825,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                         for token in value
                     ]
                 setattr(tokenizer, key, value)
-
+                cls._add_extra_special_tokens(key)
+
         special_tokens = tokenizer.all_special_tokens
+        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
+        added_tokens_file = None if pass_added_tokens_file else added_tokens_file
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)

-            # Sort added tokens by index
             added_tok_encoder_sorted = list(
                 sorted(added_tok_encoder.items(), key=lambda x: x[1])
             )
@@ -1744,14 +1843,11 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                     and index != len(tokenizer)
                     and tokenizer.convert_tokens_to_ids(token) != index
                 ):
-                    # index is the current length of the tokenizer (not in vocabulary)
                     raise ValueError(
                         f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
                         f"{index}."
                     )
                 elif not has_tokenizer_file and index != len(tokenizer):
-                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
-                    # current length of the tokenizer.
                     raise ValueError(
                         f"Non-consecutive added token '{token}' found. "
                         f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
@@ -1760,15 +1856,12 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 tokenizer.add_tokens(
                     token, special_tokens=bool(token in special_tokens)
                 )
-        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
         added_tokens = tokenizer.sanitize_special_tokens()
         if added_tokens:
             logging.info(
                 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
             )
-        # save all of related things into default root dir
         if pretrained_model_name_or_path in cls.pretrained_init_configuration:
-            # tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder))
             tokenizer.save_pretrained(cache_dir)

         if return_tokenizer_file_dir:
@@ -1827,7 +1920,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         for file_id in self.resource_files_names.keys():
             tokenizer_config.pop(file_id, None)

-        # Sanitize AddedTokens
         def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
             if isinstance(obj, AddedToken):
                 out = obj.__getstate__()
@@ -1845,10 +1937,16 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 }
             return obj

-        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
         tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

-
+        added_tokens = {}
+        for key, value in self.added_tokens_decoder.items():
+            if isinstance(value, AddedToken):
+                added_tokens[key] = value.__getstate__()
+            else:
+                added_tokens[key] = AddedToken(value).__getstate__()
+        tokenizer_config["added_tokens_decoder"] = added_tokens
+
         tokenizer_class = self.__class__.__name__
         tokenizer_config["tokenizer_class"] = tokenizer_class

@@ -1856,7 +1954,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             f.write(json.dumps(tokenizer_config, ensure_ascii=False))
         logging.info(f"tokenizer config file saved in {tokenizer_config_file}")

-        # Sanitize AddedTokens in special_tokens_map
         write_dict = convert_added_tokens(
             self.special_tokens_map_extended, add_type_field=False
         )
@@ -1946,8 +2043,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
         old_pad_to_max_length = kwargs.pop("pad_to_max_seq_len", False)

-        # Backward compatibility for previous behavior, maybe we should deprecate it:
-        # If you only set max_length, it activates truncation for max_length
         if max_length is not None and padding is False and truncation is False:
             if verbose:
                 if not self.deprecation_warnings.get(
@@ -1992,7 +2087,6 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 warnings.warn(
                     "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`."
                 )
-            # Default to pad to the longest sequence in the batch
             padding_strategy = PaddingStrategy.LONGEST
         elif not isinstance(padding, PaddingStrategy):
             padding_strategy = PaddingStrategy(padding)
@@ -2106,6 +2200,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         return_offsets_mapping: bool = False,
         add_special_tokens: bool = True,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
         **kwargs,
@@ -2215,6 +2310,9 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
                 Defaults to `None`.
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_tensors (str or [TensorType], optional):
                 If set, will return tensors instead of list of python integers. Acceptable values are:

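With `padding_side` now documented on the call path, a per-call override looks like this (tokenizer instance and texts are illustrative; `return_tensors="pd"` assumes the Paddle tensor type):

```python
batch = tokenizer(
    ["a short prompt", "a somewhat longer prompt"],
    padding=True,
    padding_side="left",   # left-pad for decoder-only generation
    return_tensors="pd",   # assumption: Paddle tensors
)
```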
@@ -2333,6 +2431,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             return_offsets_mapping=return_offsets_mapping,
             add_special_tokens=add_special_tokens,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             verbose=verbose,
             **kwargs,
@@ -2355,6 +2454,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             return_offsets_mapping=return_offsets_mapping,
             add_special_tokens=add_special_tokens,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             verbose=verbose,
             **kwargs,
@@ -2371,6 +2471,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -2427,6 +2528,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_position_ids=return_position_ids,
             return_token_type_ids=return_token_type_ids,
@@ -2449,6 +2551,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         stride: int = 0,
         is_split_into_words: bool = False,
+        padding_side: Optional[Literal["right", "left"]] = None,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2502,6 +2605,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -2524,6 +2628,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_position_ids: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2563,6 +2668,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         return_offsets_mapping=False,
         add_special_tokens=True,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
         **kwargs,
@@ -2615,6 +2721,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_position_ids=return_position_ids,
             return_token_type_ids=return_token_type_ids,
@@ -2645,6 +2752,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_position_ids: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -2670,6 +2778,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         ],
         padding: Union[bool, str, PaddingStrategy] = True,
         max_length: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         pad_to_multiple_of: Optional[int] = None,
         return_attention_mask: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
@@ -2714,6 +2823,9 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -2727,6 +2839,8 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
             verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
+        import paddle
+
         # If we have a list of dicts, let's convert it in a dict of lists
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(
             encoded_inputs[0], (dict, BatchEncoding)
@@ -2780,13 +2894,28 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         required_input = encoded_inputs[self.model_input_names[0]]
         if required_input and not isinstance(required_input[0], (list, tuple)):
-
-
-
-
-
-
-
+            # some tokenizers might not have the padding_side attribute
+            if "padding_side" in set(inspect.signature(self._pad).parameters.keys()):
+                encoded_inputs = self._pad(
+                    encoded_inputs,
+                    max_length=max_length,
+                    padding_strategy=padding_strategy,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    padding_side=padding_side,
+                    return_attention_mask=return_attention_mask,
+                )
+            else:
+                original_padding_side = self.padding_side
+                self.padding_side = padding_side
+                encoded_inputs = self._pad(
+                    encoded_inputs,
+                    max_length=max_length,
+                    padding_strategy=padding_strategy,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    return_attention_mask=return_attention_mask,
+                )
+                self.padding_side = original_padding_side
+
             return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

         batch_size = len(required_input)
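The `inspect.signature` gate above is a forward-compatibility trick: pass the new keyword only when the (possibly overridden) `_pad` declares it. A self-contained sketch of the pattern with invented classes:

```python
import inspect

class Base:
    def _pad(self, data, padding_side="right"):
        return f"pad {padding_side}"

class LegacySubclass(Base):
    def _pad(self, data):  # older override without the new keyword
        return "pad legacy"

def call_pad(obj, data, padding_side):
    # Only forward the keyword if the method accepts it.
    if "padding_side" in inspect.signature(obj._pad).parameters:
        return obj._pad(data, padding_side=padding_side)
    return obj._pad(data)

print(call_pad(Base(), [], "left"))            # pad left
print(call_pad(LegacySubclass(), [], "left"))  # pad legacy
```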
@@ -2805,6 +2934,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 inputs,
                 max_length=max_length,
                 padding_strategy=padding_strategy,
+                padding_side=padding_side,
                 pad_to_multiple_of=pad_to_multiple_of,
                 return_attention_mask=return_attention_mask,
             )
@@ -2887,6 +3017,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_position_ids=None,
         return_token_type_ids: Optional[bool] = None,
@@ -2979,7 +3110,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         sequence = ids + pair_ids if pair else ids
         token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

-        # Build output
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -3037,6 +3168,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )

@@ -3189,6 +3321,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[Literal["right", "left"]] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -3204,13 +3337,16 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in
+                The tokenizer padding sides are defined in `padding_side` argument:

                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
+            padding_side: (optional) The side on which the model should have padding applied.
+                Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
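A concrete, pure-Python illustration of the two sides for the 1-D case (invented ids and pad id; this mirrors the list-concatenation branches in `_pad` below):

```python
input_ids = [11, 12, 13]
attention_mask = [1, 1, 1]
pad_token_id = 0  # assumption: the tokenizer's pad id
difference = 2    # max_length - len(input_ids)

# padding_side == "right"
assert input_ids + [pad_token_id] * difference == [11, 12, 13, 0, 0]
assert attention_mask + [0] * difference == [1, 1, 1, 0, 0]

# padding_side == "left"
assert [pad_token_id] * difference + input_ids == [0, 0, 11, 12, 13]
assert [0] * difference + attention_mask == [0, 0, 1, 1, 1]
```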
@@ -3244,12 +3380,33 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

         if needs_to_be_padded:
             difference = max_length - len(required_input)
+            padding_side = (
+                padding_side if padding_side is not None else self.padding_side
+            )

-            if
+            if padding_side == "right":
                 if return_attention_mask:
-
-
-
+                    if len(np.shape(encoded_inputs["attention_mask"])) > 2:
+                        encoded_inputs["attention_mask"] = np.pad(
+                            encoded_inputs["attention_mask"],
+                            pad_width=[(0, 0), (0, difference), (0, difference)],
+                            mode="constant",
+                            constant_values=0,
+                        ).tolist()
+                    else:
+                        encoded_inputs["attention_mask"] = (
+                            encoded_inputs["attention_mask"] + [0] * difference
+                        )
+                if "attn_mask_startend_row_indices" in encoded_inputs:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
+                        [
+                            np.array(
+                                [encoded_inputs["attn_mask_startend_row_indices"]],
+                                dtype=np.int32,
+                            ),
+                            np.zeros([1, difference], dtype=np.int32),
+                        ],
+                        axis=-1,
                     )
                 if "token_type_ids" in encoded_inputs:
                     encoded_inputs["token_type_ids"] = (
@@ -3284,11 +3441,32 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 encoded_inputs[self.model_input_names[0]] = (
                     required_input + [self.pad_token_id] * difference
                 )
-            elif
+            elif padding_side == "left":
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"]
-
-
+                    if len(np.shape(encoded_inputs["attention_mask"])) > 2:
+                        # attention_mask shape [1,seq_len,seq_len]
+                        encoded_inputs["attention_mask"] = np.pad(
+                            encoded_inputs["attention_mask"],
+                            pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                            mode="constant",
+                            constant_values=0,
+                        ).tolist()
+                    else:
+                        encoded_inputs["attention_mask"] = [
+                            0
+                        ] * difference + encoded_inputs["attention_mask"]
+                if "attn_mask_startend_row_indices" in encoded_inputs:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.concatenate(
+                        [
+                            np.zeros([1, difference], dtype=np.int32),
+                            np.array(
+                                [encoded_inputs["attn_mask_startend_row_indices"]],
+                                dtype=np.int32,
+                            )
+                            + difference,
+                        ],
+                        axis=-1,
+                    )
                 if "token_type_ids" in encoded_inputs:
                     encoded_inputs["token_type_ids"] = [
                         self.pad_token_type_id
@@ -3322,6 +3500,15 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
                 ] * difference + required_input
             else:
                 raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+        else:
+            if "attn_mask_startend_row_indices" in encoded_inputs:
+                if len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 1:
+                    encoded_inputs["attn_mask_startend_row_indices"] = np.array([encoded_inputs["attn_mask_startend_row_indices"]], dtype=np.int32)  # fmt:skip
+
+        if "attn_mask_startend_row_indices" in encoded_inputs:
+            assert (
+                len(np.shape(encoded_inputs["attn_mask_startend_row_indices"])) == 2
+            )  # [num_head, seq_len]

         return encoded_inputs

@@ -3338,9 +3525,38 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         """
         raise NotImplementedError

+    def decode_token(
+        self,
+        all_input_ids: List[int],
+        prefix_offset: int = 0,
+        read_offset: int = 0,
+    ) -> Tuple[str, int, int]:
+        """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
+        prefix_text = self.decode(
+            all_input_ids[prefix_offset:read_offset],
+            skip_special_tokens=False,
+            clean_up_tokenization_spaces=False,
+        )
+        new_text = self.decode(
+            all_input_ids[prefix_offset:],
+            skip_special_tokens=False,
+            clean_up_tokenization_spaces=False,
+        )
+
+        if (
+            len(new_text) > len(prefix_text)
+            and not prefix_text.endswith("�")
+            and not new_text.endswith("�")
+        ):
+            prefix_index = new_text.index(prefix_text)
+            new_text = new_text[prefix_index + len(prefix_text) :]
+            return new_text, read_offset, len(all_input_ids)
+        else:
+            return "", prefix_offset, read_offset
+
     def batch_decode(
         self,
-        sequences
+        sequences,
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = True,
         **kwargs,
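The contract documented above is designed for a driver loop that feeds the growing id list back in together with both offsets; text is withheld while the decoded tail still ends in U+FFFD (an incomplete multi-byte sequence). A hedged driver sketch (the id stream and tokenizer are placeholders):

```python
all_ids, prefix_offset, read_offset = [], 0, 0
for token_id in generate_ids_stream():  # hypothetical generator of token ids
    all_ids.append(token_id)
    text, prefix_offset, read_offset = tokenizer.decode_token(
        all_ids, prefix_offset=prefix_offset, read_offset=read_offset
    )
    if text:
        print(text, end="", flush=True)  # emit only the newly finalized piece
```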
@@ -3373,7 +3589,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):

     def decode(
         self,
-        token_ids
+        token_ids,
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = True,
         **kwargs,