paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +29 -73
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/ts/funcs.py +19 -8
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +8 -2
- paddlex/inference/models/formula_recognition/processors.py +90 -77
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/models/text_recognition/result.py +1 -1
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +21 -18
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_app.py +46 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +30 -16
- paddlex/inference/utils/hpi_model_info_collection.json +666 -162
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/misc.py +20 -0
- paddlex/inference/utils/mkldnn_blocklist.py +59 -0
- paddlex/inference/utils/official_models.py +140 -5
- paddlex/inference/utils/pp_option.py +74 -9
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +8 -5
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +3 -3
- paddlex/utils/device.py +5 -13
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +11 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- paddlex/utils/subclass_register.py +2 -2
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
- {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py

@@ -18,14 +18,26 @@ from .base_batch_sampler import BaseBatchSampler
 
 
 class DocVLMBatchSampler(BaseBatchSampler):
-
+
+    model_names_only_supports_batchsize_of_one = {"PP-DocBee-2B", "PP-DocBee-7B"}
+
+    def __init__(self, model_name, batch_size: int = 1) -> None:
         """Initializes the BaseBatchSampler.
 
         Args:
+            model_name (str): The name of the model.
             batch_size (int, optional): The size of each batch. Only support 1.
         """
-
-
+        self.model_name = model_name
+        if (
+            self.model_name in self.model_names_only_supports_batchsize_of_one
+            and batch_size != 1
+        ):
+            logging.warning(
+                f"doc vlm batch sampler only support batch size 1 for {self.model_name}, but got {batch_size} and it will not take effect."
+            )
+            batch_size = 1
+        super().__init__(batch_size)
 
     def sample(self, inputs):
         """Generate list of input file path.
@@ -37,14 +49,22 @@ class DocVLMBatchSampler(BaseBatchSampler):
             list: list of file path.
         """
         if isinstance(inputs, dict):
-
-
-            yield inputs
-        else:
+            inputs = [inputs]
+        if not (isinstance(inputs, list) and all(isinstance(i, dict) for i in inputs)):
             raise TypeError(
-                f"Not supported input data type! Only `
+                f"Not supported input data type! Only `Dict` or `List[Dict]` are supported, but got: {type(inputs)}."
             )
 
+        batch = []
+        for input_ in inputs:
+            batch.append(input_)
+            if len(batch) == self.batch_size:
+                yield batch
+                batch = []
+
+        if len(batch) > 0:
+            yield batch
+
     @BaseBatchSampler.batch_size.setter
     def batch_size(self, batch_size):
         """Sets the batch size.
@@ -56,9 +76,12 @@ class DocVLMBatchSampler(BaseBatchSampler):
         Warning: If the batch size is not equal 1.
         """
         # only support batch size 1
-        if
+        if (
+            self.model_name in self.model_names_only_supports_batchsize_of_one
+            and batch_size != 1
+        ):
             logging.warning(
-                f"doc vlm batch sampler only support batch size 1, but got {batch_size}."
+                f"doc vlm batch sampler only support batch size 1 for {self.model_name}, but got {batch_size} and it will not take effect."
            )
         else:
             self._batch_size = batch_size
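Taken together, these hunks turn DocVLMBatchSampler from a batch-size-1-only sampler into a general one: a single dict is wrapped into a list, List[Dict] inputs are grouped into batches, and the size is forced back to 1 only for models named in model_names_only_supports_batchsize_of_one. A minimal usage sketch of the new behavior (the import path follows the module layout above; the dict keys are placeholders, not the real input schema):

    from paddlex.inference.common.batch_sampler.doc_vlm_batch_sampler import (
        DocVLMBatchSampler,
    )

    # PP-DocBee2-3B is not in the batch-size-1 set, so batch_size=2 is honored.
    sampler = DocVLMBatchSampler(model_name="PP-DocBee2-3B", batch_size=2)
    inputs = [{"image": "p1.png"}, {"image": "p2.png"}, {"image": "p3.png"}]
    for batch in sampler.sample(inputs):
        print(len(batch))  # 2, then 1 (the remainder batch)

    # For PP-DocBee-2B a warning is logged and the batch size silently stays 1.
    sampler = DocVLMBatchSampler(model_name="PP-DocBee-2B", batch_size=4)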
paddlex/inference/common/batch_sampler/image_batch_sampler.py

@@ -40,7 +40,8 @@ class ImgBatch(Batch):
 
 class ImageBatchSampler(BaseBatchSampler):
 
-
+    IMG_SUFFIX = ["jpg", "png", "jpeg", "bmp"]
+    PDF_SUFFIX = ["pdf"]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -54,16 +55,19 @@ class ImageBatchSampler(BaseBatchSampler):
         return save_path.as_posix()
 
     def _get_files_list(self, fp):
-        file_list = []
         if fp is None or not os.path.exists(fp):
-            raise Exception(f"Not found any
+            raise Exception(f"Not found any files in path: {fp}")
+        if os.path.isfile(fp):
+            return [fp]
 
-
-
-        elif os.path.isdir(fp):
+        file_list = []
+        if os.path.isdir(fp):
             for root, dirs, files in os.walk(fp):
                 for single_file in files:
-                    if
+                    if (
+                        single_file.split(".")[-1].lower()
+                        in self.IMG_SUFFIX + self.PDF_SUFFIX
+                    ):
                         file_list.append(os.path.join(root, single_file))
         if len(file_list) == 0:
             raise Exception("Not found any file in {}".format(fp))
@@ -81,29 +85,34 @@ class ImageBatchSampler(BaseBatchSampler):
                 if len(batch) == self.batch_size:
                     yield batch
                     batch = ImgBatch()
-            elif isinstance(input, str) and input.split(".")[-1] in ("PDF", "pdf"):
-                file_path = (
-                    self._download_from_url(input)
-                    if input.startswith("http")
-                    else input
-                )
-                for page_idx, page_img in enumerate(self.pdf_reader.read(file_path)):
-                    batch.append(page_img, file_path, page_idx)
-                    if len(batch) == self.batch_size:
-                        yield batch
-                        batch = ImgBatch()
             elif isinstance(input, str):
-
-
-
-
-
-
-
+                suffix = input.split(".")[-1].lower()
+                if suffix in self.PDF_SUFFIX:
+                    file_path = (
+                        self._download_from_url(input)
+                        if input.startswith("http")
+                        else input
+                    )
+                    for page_idx, page_img in enumerate(
+                        self.pdf_reader.read(file_path)
+                    ):
+                        batch.append(page_img, file_path, page_idx)
+                        if len(batch) == self.batch_size:
+                            yield batch
+                            batch = ImgBatch()
+                elif suffix in self.IMG_SUFFIX:
+                    file_path = (
+                        self._download_from_url(input)
+                        if input.startswith("http")
+                        else input
+                    )
                     batch.append(file_path, file_path, None)
                     if len(batch) == self.batch_size:
                         yield batch
                         batch = ImgBatch()
+                else:
+                    file_list = self._get_files_list(input)
+                    yield from self.sample(file_list)
             else:
                 logging.warning(
                     f"Not supported input data type! Only `numpy.ndarray` and `str` are supported! So has been ignored: {input}."
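The rewritten sample() lower-cases the suffix before dispatching, so SCAN.PDF and photo.JPEG are now handled, and any string that is neither a PDF nor an image is expanded via _get_files_list() and re-sampled, which makes directories valid inputs. A standalone sketch of just the routing rule, not the PaddleX API itself:

    IMG_SUFFIX = ["jpg", "png", "jpeg", "bmp"]
    PDF_SUFFIX = ["pdf"]

    def route(path: str) -> str:
        # Mirrors the new suffix dispatch in ImageBatchSampler.sample().
        suffix = path.split(".")[-1].lower()
        if suffix in PDF_SUFFIX:
            return "read pages via pdf_reader"
        if suffix in IMG_SUFFIX:
            return "append the file path to the current batch"
        return "expand with _get_files_list() and recurse"

    print(route("scan.PDF"))     # read pages via pdf_reader
    print(route("photo.JPEG"))   # append the file path to the current batch
    print(route("my_data_dir"))  # expand with _get_files_list() and recurse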
paddlex/inference/common/result/mixin.py

@@ -161,7 +161,7 @@ class JsonMixin:
         else:
             if len(json_data) > 1:
                 logging.warning(
-                    f"The result has multiple json files need to be saved. But the `save_path` has been
+                    f"The result has multiple json files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._json_writer.write(
                 save_path,
@@ -264,7 +264,7 @@ class Base64Mixin:
         else:
             if len(base64) > 1:
                 logging.warning(
-                    f"The result has multiple base64 files need to be saved. But the `save_path` has been
+                    f"The result has multiple base64 files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._base64_writer.write(
                 save_path, base64[list(base64.keys())[0]], *args, **kwargs
@@ -328,7 +328,7 @@ class ImgMixin:
         else:
             if len(img) > 1:
                 logging.warning(
-                    f"The result has multiple img files need to be saved. But the `save_path` has been
+                    f"The result has multiple img files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._img_writer.write(save_path, img[list(img.keys())[0]], *args, **kwargs)
 
@@ -392,7 +392,7 @@ class CSVMixin:
         else:
             if len(csv) > 1:
                 logging.warning(
-                    f"The result has multiple csv files need to be saved. But the `save_path` has been
+                    f"The result has multiple csv files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._csv_writer.write(save_path, csv[list(csv.keys())[0]], *args, **kwargs)
 
@@ -455,7 +455,7 @@ class HtmlMixin:
         else:
             if len(html) > 1:
                 logging.warning(
-                    f"The result has multiple html files need to be saved. But the `save_path` has been
+                    f"The result has multiple html files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._html_writer.write(
                 save_path, html[list(html.keys())[0]], *args, **kwargs
@@ -524,7 +524,7 @@ class XlsxMixin:
         else:
             if len(xlsx) > 1:
                 logging.warning(
-                    f"The result has multiple xlsx files need to be saved. But the `save_path` has been
+                    f"The result has multiple xlsx files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._xlsx_writer.write(
                 save_path, xlsx[list(xlsx.keys())[0]], *args, **kwargs
@@ -589,7 +589,7 @@ class VideoMixin:
         else:
             if len(video) > 1:
                 logging.warning(
-                    f"The result has multiple video files need to be saved. But the `save_path` has been
+                    f"The result has multiple video files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             video_writer.write(save_path, video[list(video.keys())[0]], *args, **kwargs)
 
@@ -609,10 +609,13 @@ class MarkdownMixin:
         self._save_funcs.append(self.save_to_markdown)
 
     @abstractmethod
-    def _to_markdown(self) -> Dict[str, Union[str, Dict[str, Any]]]:
+    def _to_markdown(self, pretty=True) -> Dict[str, Union[str, Dict[str, Any]]]:
         """
         Convert the result to markdown format.
 
+        Args:
+            pretty (Optional[bool]): whether to pretty markdown by HTML, default by True.
+
         Returns:
             Dict[str, Union[str, Dict[str, Any]]]: A dictionary containing markdown text and image data.
         """
@@ -627,7 +630,7 @@ class MarkdownMixin:
         """
         return self._to_markdown()
 
-    def save_to_markdown(self, save_path, *args, **kwargs) -> None:
+    def save_to_markdown(self, save_path, pretty=True, *args, **kwargs) -> None:
         """Save the markdown data to a file.
 
         Args:
@@ -665,7 +668,7 @@ class MarkdownMixin:
             self._markdown_writer.write,
             self._img_writer.write,
             self.save_path,
-            self._to_markdown(),
+            self._to_markdown(pretty=pretty),
             *args,
             **kwargs,
         )
@@ -698,5 +701,9 @@ class MarkdownMixin:
         if isinstance(value, dict):
             base_save_path = save_path.parent
             for img_path, img_data in value.items():
-
-
+                save_img_func(
+                    (base_save_path / img_path).as_posix(),
+                    img_data,
+                    *args,
+                    **kwargs,
+                )
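For callers, the visible change in MarkdownMixin is the new pretty flag on save_to_markdown, which is forwarded to _to_markdown. A hedged usage sketch, where result stands for any pipeline result that mixes in MarkdownMixin:

    result.save_to_markdown("output/report.md")                 # pretty=True by default: markdown prettified with HTML
    result.save_to_markdown("output/report.md", pretty=False)   # plain markdown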
paddlex/inference/models/base/predictor/base_predictor.py

@@ -118,17 +118,9 @@ class BasePredictor(
         self.batch_sampler.batch_size = batch_size
         self._use_hpip = use_hpip
         if not use_hpip:
-            if hpi_config is not None:
-                logging.warning(
-                    "`hpi_config` will be ignored when not using the high-performance inference plugin."
-                )
             self._pp_option = self._prepare_pp_option(pp_option, device)
         else:
             require_hpip()
-            if pp_option is not None:
-                logging.warning(
-                    "`pp_option` will be ignored when using the high-performance inference plugin."
-                )
             self._hpi_config = self._prepare_hpi_config(hpi_config, device)
 
         logging.debug(f"{self.__class__.__name__}: {self.model_dir}")
@@ -343,6 +335,8 @@ class BasePredictor(
         device_info = None
         if pp_option is None:
             pp_option = PaddlePredictorOption(model_name=self.model_name)
+        elif pp_option.model_name is None:
+            pp_option.model_name = self.model_name
        if device_info:
             pp_option.device_type = device_info[0]
             pp_option.device_id = device_info[1]
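The second hunk means a user-built PaddlePredictorOption no longer needs to carry the model name itself; if model_name is left unset, the predictor back-fills it. A sketch under the assumption that the option is handed to a predictor via the usual pp_option argument:

    from paddlex.inference.utils.pp_option import PaddlePredictorOption

    opt = PaddlePredictorOption()  # model_name left as None
    # When a predictor receives this via `pp_option=opt`, the new
    # `elif pp_option.model_name is None` branch fills in the predictor's own
    # model name before the device info is applied.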
paddlex/inference/models/common/static_infer.py

@@ -22,8 +22,7 @@ import numpy as np
 
 from ....utils import logging
 from ....utils.deps import class_requires_deps
-from ....utils.
-from ....utils.flags import DEBUG, INFER_BENCHMARK_USE_NEW_INFER_API, USE_PIR_TRT
+from ....utils.flags import DEBUG, USE_PIR_TRT
 from ...utils.benchmark import benchmark, set_inference_operations
 from ...utils.hpi import (
     HPIConfig,
@@ -34,15 +33,12 @@ from ...utils.hpi import (
     suggest_inference_backend_and_config,
 )
 from ...utils.model_paths import get_model_paths
-from ...utils.pp_option import PaddlePredictorOption
+from ...utils.pp_option import PaddlePredictorOption, get_default_run_mode
 from ...utils.trt_config import DISABLE_TRT_HALF_OPS_CONFIG
 
 CACHE_DIR = ".cache"
 
 INFERENCE_OPERATIONS = [
-    "PaddleCopyToDevice",
-    "PaddleCopyToHost",
-    "PaddleModelInfer",
     "PaddleInferChainLegacy",
     "MultiBackendInfer",
 ]
@@ -233,47 +229,6 @@ def _sort_inputs(inputs, names):
     return inputs
 
 
-def _concatenate(*callables):
-    def _chain(x):
-        for c in callables:
-            x = c(x)
-        return x
-
-    return _chain
-
-
-@benchmark.timeit
-class PaddleCopyToDevice:
-    def __init__(self, device_type, device_id):
-        self.device_type = device_type
-        self.device_id = device_id
-
-    def __call__(self, arrs):
-        import paddle
-
-        device_id = [self.device_id] if self.device_id is not None else self.device_id
-        device = constr_device(self.device_type, device_id)
-        paddle_tensors = [paddle.to_tensor(i, place=device) for i in arrs]
-        return paddle_tensors
-
-
-@benchmark.timeit
-class PaddleCopyToHost:
-    def __call__(self, paddle_tensors):
-        arrs = [i.numpy() for i in paddle_tensors]
-        return arrs
-
-
-@benchmark.timeit
-class PaddleModelInfer:
-    def __init__(self, predictor):
-        super().__init__()
-        self.predictor = predictor
-
-    def __call__(self, x):
-        return self.predictor.run(x)
-
-
 # FIXME: Name might be misleading
 @benchmark.timeit
 class PaddleInferChainLegacy:
@@ -317,15 +272,7 @@ class PaddleInfer(StaticInfer):
         self.model_file_prefix = model_file_prefix
         self._option = option
         self.predictor = self._create()
-
-            device_type = self._option.device_type
-            device_type = "gpu" if device_type == "dcu" else device_type
-            copy_to_device = PaddleCopyToDevice(device_type, self._option.device_id)
-            copy_to_host = PaddleCopyToHost()
-            model_infer = PaddleModelInfer(self.predictor)
-            self.infer = _concatenate(copy_to_device, model_infer, copy_to_host)
-        else:
-            self.infer = PaddleInferChainLegacy(self.predictor)
+        self.infer = PaddleInferChainLegacy(self.predictor)
 
     def __call__(self, x: Sequence[np.ndarray]) -> List[np.ndarray]:
         names = self.predictor.get_input_names()
@@ -373,7 +320,7 @@ class PaddleInfer(StaticInfer):
             logging.debug("`device_id` has been set to None")
 
         if (
-            self._option.device_type in ("gpu", "dcu")
+            self._option.device_type in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu")
             and self._option.device_id is None
         ):
             self._option.device_id = 0
@@ -402,6 +349,7 @@ class PaddleInfer(StaticInfer):
                 if self._option.run_mode == "paddle_fp16"
                 else PrecisionType.Float32
             )
+            config.disable_mkldnn()
             config.enable_use_gpu(100, self._option.device_id, precision)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
@@ -417,12 +365,16 @@ class PaddleInfer(StaticInfer):
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
         elif self._option.device_type == "xpu":
+            config.enable_xpu()
+            config.set_xpu_device_id(self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
+            config.delete_pass("conv2d_bn_xpu_fuse_pass")
+            config.delete_pass("transfer_layout_pass")
         elif self._option.device_type == "mlu":
-            config.enable_custom_device("mlu")
+            config.enable_custom_device("mlu", self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
             if hasattr(config, "enable_new_executor"):
@@ -431,7 +383,7 @@ class PaddleInfer(StaticInfer):
             from paddle_custom_device.gcu import passes as gcu_passes
 
             gcu_passes.setUp()
-            config.enable_custom_device("gcu")
+            config.enable_custom_device("gcu", self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir()
             if hasattr(config, "enable_new_executor"):
@@ -455,15 +407,10 @@ class PaddleInfer(StaticInfer):
             assert self._option.device_type == "cpu"
             config.disable_gpu()
             if "mkldnn" in self._option.run_mode:
-
-
-
-
-                except Exception:
-                    logging.warning(
-                        "MKL-DNN is not available. We will disable MKL-DNN."
-                    )
-                config.set_mkldnn_cache_capacity(-1)
+                config.enable_mkldnn()
+                if "bf16" in self._option.run_mode:
+                    config.enable_mkldnn_bfloat16()
+                config.set_mkldnn_cache_capacity(self._option.mkldnn_cache_capacity)
             else:
                 if hasattr(config, "disable_mkldnn"):
                     config.disable_mkldnn()
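On CPU, the mkldnn run modes now enable MKL-DNN unconditionally (with optional bfloat16) instead of probing for availability, and the cache capacity comes from the new pp_option.mkldnn_cache_capacity instead of the hard-coded -1. The calls are standard paddle.inference.Config APIs; shown here in isolation with placeholder model paths:

    from paddle.inference import Config

    config = Config("inference.pdmodel", "inference.pdiparams")  # placeholder paths
    config.disable_gpu()
    run_mode = "mkldnn_bf16"  # e.g. taken from PaddlePredictorOption.run_mode
    if "mkldnn" in run_mode:
        config.enable_mkldnn()
        if "bf16" in run_mode:
            config.enable_mkldnn_bfloat16()
        config.set_mkldnn_cache_capacity(10)  # PaddleX reads this from pp_option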
paddlex/inference/models/common/static_infer.py (continued)

@@ -687,10 +634,19 @@ class HPInfer(StaticInfer):
         )
         backend_config = self._config.backend_config or {}
 
-        if backend == "paddle"
-
-
-        )
+        if backend == "paddle":
+            if not backend_config:
+                is_default_config = True
+            elif backend_config.keys() != {"run_mode"}:
+                is_default_config = False
+            else:
+                is_default_config = backend_config["run_mode"] == get_default_run_mode(
+                    self._config.pdx_model_name, self._config.device_type
+                )
+            if is_default_config:
+                logging.warning(
+                    "The Paddle Inference backend is selected with the default configuration. This may not provide optimal performance."
+                )
 
         return backend, backend_config
 
@@ -833,7 +789,7 @@ class HPInfer(StaticInfer):
             for name, shapes in backend_config.dynamic_shapes.items():
                 ui_option.trt_option.set_shape(name, *shapes)
         else:
-            logging.
+            logging.info(
                 "TensorRT dynamic shapes will be loaded from the file."
             )
         elif backend == "om":
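The new HPInfer logic warns only when the Paddle backend is selected with a "default" configuration. A standalone restatement of the decision:

    def is_default_paddle_config(backend_config: dict, default_run_mode: str) -> bool:
        # Mirrors the new HPInfer branch: an empty config, or one whose only key
        # is `run_mode` set to the model/device default, counts as default.
        if not backend_config:
            return True
        if backend_config.keys() != {"run_mode"}:
            return False
        return backend_config["run_mode"] == default_run_mode

    print(is_default_paddle_config({}, "paddle"))                      # True -> warning fires
    print(is_default_paddle_config({"run_mode": "mkldnn"}, "paddle"))  # False -> no warning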
paddlex/inference/models/common/tokenizer/__init__.py

@@ -15,5 +15,7 @@
 from .bert_tokenizer import BertTokenizer
 from .clip_tokenizer import CLIPTokenizer
 from .gpt_tokenizer import GPTTokenizer
+from .qwen2_5_tokenizer import MIXQwen2_5_Tokenizer
 from .qwen2_tokenizer import MIXQwen2Tokenizer, Qwen2Tokenizer
+from .qwen_tokenizer import QWenTokenizer
 from .tokenizer_utils import PretrainedTokenizer
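Both new tokenizers are therefore importable alongside the existing ones (top-level path assumed from the wheel layout above):

    from paddlex.inference.models.common.tokenizer import (
        MIXQwen2_5_Tokenizer,
        QWenTokenizer,
    )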
paddlex/inference/models/common/tokenizer/clip_tokenizer.py

@@ -403,7 +403,7 @@ class CLIPTokenizer(PretrainedTokenizer):
         Returns the size of vocabulary.
 
         Returns:
-            int: The sum of size of vocabulary and the size of
+            int: The sum of size of vocabulary and the size of special tokens.
 
         """
         return len(self.encoder)
paddlex/inference/models/common/tokenizer/gpt_tokenizer.py

@@ -41,7 +41,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
@@ -241,7 +241,7 @@ class GPTTokenizer(PretrainedTokenizer):
     Returns the size of vocabulary.
 
     Returns:
-        int: The sum of size of vocabulary and the size of
+        int: The sum of size of vocabulary and the size of special tokens.
 
     """
 
paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py (new file)

@@ -0,0 +1,112 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import List
+
+from .qwen2_tokenizer import Qwen2Tokenizer
+from .tokenizer_utils_base import AddedToken, TextInput
+
+
+class MIXQwen2_5_Tokenizer(Qwen2Tokenizer):
+    def __init__(self, *args, **kwargs):
+        super(MIXQwen2_5_Tokenizer, self).__init__(*args, **kwargs)
+
+    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+        """
+        Converts a string in a sequence of tokens, using the tokenizer.
+
+        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            **kwargs (additional keyword arguments):
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+
+        split_special_tokens = kwargs.pop(
+            "split_special_tokens", self.split_special_tokens
+        )
+
+        all_special_tokens_extended = dict(
+            (str(t), t)
+            for t in self.all_special_tokens_extended
+            if isinstance(t, AddedToken)
+        )
+
+        # Add special tokens
+        for t in self.added_tokens_decoder:
+            token = self.added_tokens_decoder[t]
+            if isinstance(token, AddedToken) and token.special:
+                all_special_tokens_extended[str(token)] = token
+                if str(token) not in self.all_special_tokens:
+                    self.all_special_tokens.append(str(token))
+                if str(token) not in self.unique_no_split_tokens:
+                    self.unique_no_split_tokens.append(str(token))
+
+        self._create_trie(self.unique_no_split_tokens)
+
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+
+        # TODO: should this be in the base class?
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase
+            escaped_special_toks = [
+                re.escape(s_tok)
+                for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+            ]
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(
+                pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text
+            )
+
+        if split_special_tokens:
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = set(
+                self.unique_no_split_tokens
+            )  # don't split on any of the added tokens
+            tokens = self.tokens_trie.split(text)
+
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = all_special_tokens_extended.get(token, None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+
+        return tokenized_text
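The only behavioral twist in this otherwise standard tokenize() is the do_lower_case branch: everything is lower-cased except the special tokens themselves. The trick can be reproduced standalone (token list is illustrative):

    import re

    special = ["<|endoftext|>", "<|im_start|>"]
    pattern = r"(" + r"|".join(re.escape(t) for t in special) + r")|" + r"(.+?)"
    text = "Hello <|im_start|>World"
    print(re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text))
    # -> hello <|im_start|>world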
paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py

@@ -18,6 +18,7 @@ import unicodedata
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
+from .....utils import logging
 from .....utils.deps import is_dep_available
 from .tokenizer_utils import PretrainedTokenizer
 from .tokenizer_utils_base import AddedToken, TextInput
@@ -146,7 +147,12 @@ class Qwen2Tokenizer(PretrainedTokenizer):
         split_special_tokens=False,
         **kwargs,
     ):
-
+        if unk_token is None:
+            logging.info(
+                "The `unk_token` parameter needs to be defined: we use `eos_token` by default."
+            )
+            unk_token = eos_token
+
         # Qwen vocab does not contain control tokens; added tokens need to be special
         bos_token = (
             AddedToken(
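Net effect of the Qwen2Tokenizer change: a missing unk_token no longer has to be supplied by the caller; it falls back to eos_token with an informational log message, mirroring this small sketch:

    def resolve_unk_token(unk_token, eos_token="<|endoftext|>"):
        # eos_token value is illustrative; the real default comes from the tokenizer config.
        if unk_token is None:
            unk_token = eos_token
        return unk_token

    print(resolve_unk_token(None))  # <|endoftext|>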
|