paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +11 -59
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +7 -1
- paddlex/inference/models/formula_recognition/processors.py +92 -79
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +19 -16
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +8 -1
- paddlex/inference/utils/hpi_model_info_collection.json +81 -2
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/official_models.py +14 -0
- paddlex/inference/utils/pp_option.py +29 -8
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +2 -2
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +1 -1
- paddlex/utils/device.py +15 -8
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +2 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/paddlex/inference/models/common/tokenizer/qwen_tokenizer.py
@@ -0,0 +1,288 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import os
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union
+
+from .tokenizer_utils import PretrainedTokenizer
+from .tokenizer_utils_base import AddedToken
+
+__all__ = ["QWenTokenizer"]
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
+
+tiktoken = None
+
+
+def is_tiktoken_available():
+    return importlib.util.find_spec("tiktoken") is not None
+
+
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+
+
+class QWenTokenizer(PretrainedTokenizer):
+    """QWen tokenizer."""
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    resource_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        padding_side="left",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if not is_tiktoken_available():
+            raise ValueError(
+                "tiktoken is not installed, please install it use: pip install tiktoken"
+            )
+
+        import tiktoken as tk
+
+        tiktoken = tk
+
+        self.errors = errors  # how to handle errors in decoding
+
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.special_tokens = {
+            token: index
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
+        }
+
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+        self.tokenizer = enc  # type: tiktoken.Encoding
+
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+
+        if "pad_token_id" in kwargs:
+            self.pad_token_id = kwargs["pad_token_id"]
+        if "eos_token_id" in kwargs:
+            self.eos_token_id = kwargs["eos_token_id"]
+
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+
+    def _update_tiktoken(self, tokens: List[str], special_tokens: bool = False) -> int:
+        if special_tokens:
+            added_tokens = []
+            for token in tokens:
+                if token in self.special_tokens:
+                    continue
+
+                token_id = len(self.mergeable_ranks) + len(self.special_tokens)
+                self.special_tokens[token] = token_id
+                self.decoder[token_id] = token
+
+                added_tokens.append(token)
+
+            import tiktoken
+
+            self.tokenizer = tiktoken.Encoding(
+                "Qwen",
+                pat_str=PAT_STR,
+                mergeable_ranks=self.mergeable_ranks,
+                special_tokens=self.special_tokens,
+            )
+
+            return len(added_tokens)
+        else:
+            raise ValueError("Adding regular tokens is not supported")
+
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Adding regular tokens is not supported")
+        new_tokens_str = []
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            new_tokens_str.append(surface_form)
+
+        return self._update_tiktoken(new_tokens_str, special_tokens)
+
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
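The hunk above adds a tiktoken-based QWenTokenizer to the PaddleX tokenizer package. The sketch below is not part of the diff; it is a minimal usage example, assuming `tiktoken` is installed, a local Qwen BPE vocabulary file is available at the hypothetical path `./qwen.tiktoken`, and the module path mirrors the file path added above.

# Minimal usage sketch (not part of the diff).
from paddlex.inference.models.common.tokenizer.qwen_tokenizer import QWenTokenizer

tokenizer = QWenTokenizer(vocab_file="./qwen.tiktoken")  # hypothetical local vocab file

# tokenize() returns token surface forms: bytes for BPE pieces, str for special tokens.
tokens = tokenizer.tokenize("<|im_start|>Hello, PaddleX 3.0.1!<|im_end|>")
ids = tokenizer.convert_tokens_to_ids(tokens)

# Round-trip back to text; convert_tokens_to_string joins bytes/str pieces.
print(tokenizer.convert_tokens_to_string(tokens))
print(len(tokenizer), tokenizer.im_start_id, tokenizer.im_end_id)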
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
@@ -239,7 +239,7 @@ def adapt_stale_fwd_patch(self, name, value):
                     "might be based on an old oversion which missing some "
                     f"arguments compared with the latest, such as {new_args}. "
                     "We automatically add compatibility on the patch for "
-                    "these
+                    "these arguments, and maybe the patch should be updated."
                 )
             else:
                 logging.warning(
@@ -247,7 +247,7 @@ def adapt_stale_fwd_patch(self, name, value):
                     "is patched and the patch might be conflict with patches made "
                     f"by paddlenlp which seems have more arguments such as {new_args}. "
                     "We automatically add compatibility on the patch for "
-                    "these
+                    "these arguments, and maybe the patch should be updated."
                 )
     if isinstance(self, paddle.nn.Layer) and inspect.isfunction(value):

@@ -290,8 +290,8 @@ class InitTrackerMeta(type):

     def __init__(cls, name, bases, attrs):
         init_func = cls.__init__
-        # If attrs has `__init__`, wrap it using
-        # Otherwise, no need to wrap again since the super cls has been
+        # If attrs has `__init__`, wrap it using accessible `_pre_init, _post_init`.
+        # Otherwise, no need to wrap again since the super cls has been wrapped.
         # TODO: remove reduplicated tracker if using super cls `__init__`
         pre_init_func = getattr(cls, "_pre_init", None) if "__init__" in attrs else None
         post_init_func = (
@@ -323,12 +323,12 @@ class InitTrackerMeta(type):

         @functools.wraps(init_func)
         def __impl__(self, *args, **kwargs):
-            #
+            # registered helper by `pre_init_func`
             if pre_init_func:
                 pre_init_func(self, init_func, *args, **kwargs)
             # keep full configuration
             init_func(self, *args, **kwargs)
-            #
+            # registered helper by `post_init_func`
             if post_init_func:
                 post_init_func(self, init_func, *args, **kwargs)
             self.init_config = kwargs
@@ -588,7 +588,7 @@ def _is_control(char):


 def _is_nonnormalized_char(char):
-    """Check
+    """Check whether `chars` is a non-normalized character."""
     cp = ord(char)
     if (
         (0xFF00 <= cp <= 0xFFEF)
@@ -688,7 +688,7 @@ class ChatTemplateMixin:
             conversation = [[conversation]]
         elif isinstance(conversation, list) and isinstance(conversation[0], str):
             raise ValueError(
-                "apply_chat_template do not support
+                "apply_chat_template do not support applying batch conversations, "
                 "so you should apply the conversation one by one."
             )

@@ -710,7 +710,7 @@ class ChatTemplateMixin:
             conversations = conversation
         else:
             raise ValueError(
-                "apply_chat_template do not support
+                "apply_chat_template do not support applying batch conversations, "
                 "so you should apply the conversation one by one."
             )
         query = self.chat_template.render(
@@ -847,7 +847,7 @@ class ChatTemplateMixin:
         self, origin_msg: List[Dict[str, str]], split_s: List[str]
     ):
         """Split the entire chat by specified words. Extract the non-learnable parts."""
-        #
+        # distinguish and replace the special words in original string to an uncompiled form: Like | -> \|
         regex_pattern = "|".join(map(re.escape, split_s))
         # splited by replaced specified words
         non_learnable_parts = re.split(
@@ -1738,7 +1738,7 @@ class PretrainedTokenizer(
                 [0] * len(pair_ids) if pair else []
             )
             encoded_inputs["offset_mapping"] = offset_mapping
-        # Build output
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -2108,7 +2108,7 @@ def _is_whitespace(char):
     """
     Checks whether `chars` is a whitespace character.
     """
-    # \t, \n, and \r are technically
+    # \t, \n, and \r are technically control characters but we treat them
     # as whitespace since they are generally considered as such.
     if char == " " or char == "\t" or char == "\n" or char == "\r":
         return True
@@ -2136,7 +2136,7 @@ def convert_to_unicode(text):

 def whitespace_tokenize(text):
     """
-    Runs basic whitespace cleaning and splitting on a
+    Runs basic whitespace cleaning and splitting on a piece of text.
     Args:
         text (str): Text to be tokenized.
     Returns:
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
@@ -1634,7 +1634,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         # From HF Hub or AI Studio
         if from_hf_hub or from_aistudio:
             # Only include the necessary resource files specified by the tokenizer cls
-            # Deep copy to avoid
+            # Deep copy to avoid modifying the class attributes
             vocab_files = copy.deepcopy(cls.resource_files_names)
             vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file

@@ -3110,7 +3110,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         sequence = ids + pair_ids if pair else ids
         token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

-        # Build output
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -3531,7 +3531,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         prefix_offset: int = 0,
         read_offset: int = 0,
     ) -> Tuple[str, int, int]:
-        """tokenizer decoding for the streaming generation use case. This method can be
+        """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
         prefix_text = self.decode(
             all_input_ids[prefix_offset:read_offset],
             skip_special_tokens=False,
--- a/paddlex/inference/models/common/tokenizer/vocab.py
+++ b/paddlex/inference/models/common/tokenizer/vocab.py
@@ -27,8 +27,8 @@ class Vocab(object):
     store/load functions.

     Args:
-        counter (collections.Counter, optional): A Counter
-            the tokens and their frequencies. Its keys will be indexed
+        counter (collections.Counter, optional): A Counter instance describes
+            the tokens and their frequencies. Its keys will be indexed according
             to the order of frequency sorting to construct mapping relationship.
             If None, `token_to_idx` must be provided as the mapping relationship.
             Default: None.
@@ -40,7 +40,7 @@ class Vocab(object):
             between tokens and indices to be used. If provided, adjust the tokens
             and indices mapping according to it. If None, counter must be provided.
             Default: None.
-        unk_token (str, optional): Special token for
+        unk_token (str, optional): Special token for unknown token. If no need,
             it also could be None. Default: None.
         pad_token (str, optional): Special token for padding token. If no need,
             it also could be None. Default: None.
@@ -231,7 +231,7 @@ class Vocab(object):
         for idx in indices:
             if not isinstance(idx, (int, np.integer)):
                 warnings.warn(
-                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly
+                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
                 )
                 idx = int(idx)

@@ -422,7 +422,7 @@ class Vocab(object):
         Args:
             token_to_idx (dict): A dict describes the mapping relationship between
                 tokens and indices.
-            unk_token (str, optional): The special token for
+            unk_token (str, optional): The special token for unknown token. If
                 no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token. If
                 no need, it also could be None. Default: None.
@@ -480,7 +480,7 @@ class Vocab(object):
         **kwargs
     ):
         """
-        Builds the :class:`Vocab`
+        Builds the :class:`Vocab` according to given iterator and other
         information. Firstly, iterate over the `iterator` to construct a
         :class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -495,7 +495,7 @@ class Vocab(object):
                 relationship between tokens and indices to be used. If provided,
                 adjust the tokens and indices mapping according to it. If None,
                 counter must be provided. Default: None.
-            unk_token (str, optional): The special token for
+            unk_token (str, optional): The special token for unknown token
                 '<unk>'. If no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token
                 '<pad>'. If no need, it also could be None. Default: None.
--- /dev/null
+++ b/paddlex/inference/models/common/vlm/conversion_utils.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import paddle
+
+
+def fuse_param_func():
+    def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None):
+        concat_fn = np.concatenate
+        split_fn = np.split
+        if isinstance(fuse_params[0], paddle.Tensor):
+            concat_fn = paddle.concat
+            split_fn = paddle.split
+
+        if is_qkv:
+            assert (
+                num_heads
+            ), f"num_heads should be number of heads for Q, but got {num_heads}"
+            assert (
+                num_key_value_heads
+            ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+            assert (
+                len(fuse_params) == 3
+            ), f"fuse_params length is not equal 3, it should be Q K V list. but got length {len(fuse_params)}"
+            num_query_groups = num_heads // num_key_value_heads
+            q_list = split_fn(fuse_params[0], num_heads, axis=-1)
+            k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1)
+            v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1)
+
+            qkv_pairs = []
+            for i in range(num_key_value_heads):
+                qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups]
+                qkv_pairs.append(k_list[i])
+                qkv_pairs.append(v_list[i])
+            return concat_fn(qkv_pairs, axis=-1)
+        else:
+            return concat_fn(fuse_params, axis=-1)
+
+    return fn
+
+
+def split_param_func():
+    def fn(
+        fused_param,
+        split_nums=2,
+        is_qkv=False,
+        num_heads=None,
+        num_key_value_heads=None,
+    ):
+        concat_fn = np.concatenate
+        split_fn = np.split
+        if isinstance(fused_param, paddle.Tensor):
+            concat_fn = paddle.concat
+            split_fn = paddle.split
+
+        if is_qkv:
+            assert (
+                num_heads
+            ), f"num_heads should be number of heads for Q, but got {num_heads}"
+            assert (
+                num_key_value_heads
+            ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+            num_query_groups = num_heads // num_key_value_heads
+            q_list, k_list, v_list = [], [], []
+            split_heads = split_fn(
+                fused_param, num_heads + 2 * num_key_value_heads, axis=-1
+            )
+            for i in range(num_key_value_heads):
+                q_list += split_heads[
+                    i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2
+                ]
+                k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2])
+                v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1])
+            return (
+                concat_fn(q_list, axis=-1),
+                concat_fn(k_list, axis=-1),
+                concat_fn(v_list, axis=-1),
+            )
+        else:
+            return split_fn(fused_param, split_nums, axis=-1)
+
+    return fn
+
+
+def split_or_fuse_func(is_fuse=True):
+    return fuse_param_func() if is_fuse else split_param_func()