paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +11 -59
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  44. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  45. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  46. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  48. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  49. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  51. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  52. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  53. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  54. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  57. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  58. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  59. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  60. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  61. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  63. paddlex/inference/models/formula_recognition/predictor.py +7 -1
  64. paddlex/inference/models/formula_recognition/processors.py +92 -79
  65. paddlex/inference/models/formula_recognition/result.py +28 -27
  66. paddlex/inference/models/image_feature/processors.py +3 -4
  67. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  68. paddlex/inference/models/object_detection/predictor.py +2 -0
  69. paddlex/inference/models/object_detection/processors.py +28 -3
  70. paddlex/inference/models/object_detection/utils.py +2 -0
  71. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  72. paddlex/inference/models/text_detection/predictor.py +8 -0
  73. paddlex/inference/models/text_detection/processors.py +44 -10
  74. paddlex/inference/models/text_detection/result.py +0 -10
  75. paddlex/inference/pipelines/__init__.py +9 -5
  76. paddlex/inference/pipelines/_parallel.py +172 -0
  77. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  78. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  79. paddlex/inference/pipelines/base.py +14 -4
  80. paddlex/inference/pipelines/components/faisser.py +1 -1
  81. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  82. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  83. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  84. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  85. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  86. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  87. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  88. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  89. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
  90. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  91. paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
  92. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  93. paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
  94. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  95. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  96. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  97. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  98. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  99. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  100. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  101. paddlex/inference/pipelines/ocr/result.py +19 -16
  102. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  103. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  104. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  105. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  106. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
  107. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  108. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  109. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  110. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  112. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  113. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  114. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  115. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  116. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  117. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  118. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  119. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  120. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  121. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  122. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  123. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  127. paddlex/inference/serving/infra/utils.py +20 -22
  128. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  129. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  130. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  131. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  132. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  133. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  134. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  135. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  136. paddlex/inference/utils/hpi.py +8 -1
  137. paddlex/inference/utils/hpi_model_info_collection.json +81 -2
  138. paddlex/inference/utils/io/readers.py +12 -12
  139. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  140. paddlex/inference/utils/official_models.py +14 -0
  141. paddlex/inference/utils/pp_option.py +29 -8
  142. paddlex/model.py +2 -2
  143. paddlex/modules/__init__.py +1 -1
  144. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  145. paddlex/modules/base/__init__.py +1 -1
  146. paddlex/modules/base/evaluator.py +5 -5
  147. paddlex/modules/base/trainer.py +1 -1
  148. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  149. paddlex/modules/doc_vlm/evaluator.py +2 -2
  150. paddlex/modules/doc_vlm/exportor.py +2 -2
  151. paddlex/modules/doc_vlm/model_list.py +1 -1
  152. paddlex/modules/doc_vlm/trainer.py +2 -2
  153. paddlex/modules/face_recognition/evaluator.py +2 -2
  154. paddlex/modules/formula_recognition/evaluator.py +5 -2
  155. paddlex/modules/formula_recognition/model_list.py +3 -0
  156. paddlex/modules/formula_recognition/trainer.py +3 -0
  157. paddlex/modules/general_recognition/evaluator.py +1 -1
  158. paddlex/modules/image_classification/evaluator.py +2 -2
  159. paddlex/modules/image_classification/model_list.py +1 -0
  160. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  161. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  162. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  163. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  164. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  165. paddlex/modules/object_detection/evaluator.py +2 -2
  166. paddlex/modules/object_detection/model_list.py +2 -0
  167. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  168. paddlex/modules/table_recognition/evaluator.py +2 -2
  169. paddlex/modules/text_detection/evaluator.py +2 -2
  170. paddlex/modules/text_detection/model_list.py +2 -0
  171. paddlex/modules/text_recognition/evaluator.py +2 -2
  172. paddlex/modules/text_recognition/model_list.py +2 -0
  173. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  174. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  175. paddlex/modules/ts_classification/evaluator.py +2 -2
  176. paddlex/modules/ts_forecast/evaluator.py +2 -2
  177. paddlex/modules/video_classification/evaluator.py +2 -2
  178. paddlex/modules/video_detection/evaluator.py +2 -2
  179. paddlex/ops/__init__.py +2 -2
  180. paddlex/paddlex_cli.py +19 -13
  181. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  182. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  183. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  184. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  185. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  186. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  187. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  188. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  189. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  190. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  191. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  192. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  193. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  194. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  195. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  196. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  197. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  198. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  200. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  201. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  202. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  203. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  204. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  205. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  206. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  207. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  208. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  209. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  210. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  211. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  212. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  213. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  214. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  215. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  216. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  217. paddlex/repo_apis/base/config.py +1 -1
  218. paddlex/repo_manager/core.py +3 -3
  219. paddlex/repo_manager/meta.py +6 -2
  220. paddlex/repo_manager/repo.py +17 -16
  221. paddlex/utils/custom_device_list.py +26 -2
  222. paddlex/utils/deps.py +1 -1
  223. paddlex/utils/device.py +15 -8
  224. paddlex/utils/env.py +4 -0
  225. paddlex/utils/flags.py +2 -4
  226. paddlex/utils/fonts/__init__.py +34 -4
  227. paddlex/utils/misc.py +1 -1
  228. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
  229. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
  230. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  231. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  232. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
  233. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,288 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import base64
+ import importlib.util
+ import os
+ import unicodedata
+ from typing import Collection, Dict, List, Set, Tuple, Union
+
+ from .tokenizer_utils import PretrainedTokenizer
+ from .tokenizer_utils_base import AddedToken
+
+ __all__ = ["QWenTokenizer"]
+
+
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ ENDOFTEXT = "<|endoftext|>"
+ IMSTART = "<|im_start|>"
+ IMEND = "<|im_end|>"
+ # as the default behavior is changed to allow special tokens in
+ # regular texts, the surface forms of special tokens need to be
+ # as different as possible to minimize the impact
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+ SPECIAL_TOKENS = (
+     ENDOFTEXT,
+     IMSTART,
+     IMEND,
+ ) + EXTRAS
+
+ tiktoken = None
+
+
+ def is_tiktoken_available():
+     return importlib.util.find_spec("tiktoken") is not None
+
+
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+     with open(tiktoken_bpe_file, "rb") as f:
+         contents = f.read()
+     return {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in contents.splitlines() if line)
+     }
+
+
+ class QWenTokenizer(PretrainedTokenizer):
+     """QWen tokenizer."""
+
+     model_input_names = ["input_ids", "attention_mask", "position_ids"]
+     resource_files_names = VOCAB_FILES_NAMES
+
+     def __init__(
+         self,
+         vocab_file,
+         errors="replace",
+         padding_side="left",
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         if not is_tiktoken_available():
+             raise ValueError(
+                 "tiktoken is not installed, please install it use: pip install tiktoken"
+             )
+
+         import tiktoken as tk
+
+         tiktoken = tk
+
+         self.errors = errors  # how to handle errors in decoding
+
+         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+         self.special_tokens = {
+             token: index
+             for index, token in enumerate(
+                 SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+             )
+         }
+
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         assert (
+             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+         self.decoder = {
+             v: k for k, v in self.mergeable_ranks.items()
+         }  # type: dict[int, bytes|str]
+         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+         self.tokenizer = enc  # type: tiktoken.Encoding
+
+         self.eod_id = self.tokenizer.eot_token
+         self.im_start_id = self.special_tokens[IMSTART]
+         self.im_end_id = self.special_tokens[IMEND]
+
+         if "pad_token_id" in kwargs:
+             self.pad_token_id = kwargs["pad_token_id"]
+         if "eos_token_id" in kwargs:
+             self.eos_token_id = kwargs["eos_token_id"]
+
+     def __len__(self) -> int:
+         return self.tokenizer.n_vocab
+
+     def get_vocab(self) -> Dict[bytes, int]:
+         return self.mergeable_ranks
+
+     def convert_tokens_to_ids(
+         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+     ) -> List[int]:
+         ids = []
+         if isinstance(tokens, (str, bytes)):
+             if tokens in self.special_tokens:
+                 return self.special_tokens[tokens]
+             else:
+                 return self.mergeable_ranks.get(tokens)
+         for token in tokens:
+             if token in self.special_tokens:
+                 ids.append(self.special_tokens[token])
+             else:
+                 ids.append(self.mergeable_ranks.get(token))
+         return ids
+
+     def _update_tiktoken(self, tokens: List[str], special_tokens: bool = False) -> int:
+         if special_tokens:
+             added_tokens = []
+             for token in tokens:
+                 if token in self.special_tokens:
+                     continue
+
+                 token_id = len(self.mergeable_ranks) + len(self.special_tokens)
+                 self.special_tokens[token] = token_id
+                 self.decoder[token_id] = token
+
+                 added_tokens.append(token)
+
+             import tiktoken
+
+             self.tokenizer = tiktoken.Encoding(
+                 "Qwen",
+                 pat_str=PAT_STR,
+                 mergeable_ranks=self.mergeable_ranks,
+                 special_tokens=self.special_tokens,
+             )
+
+             return len(added_tokens)
+         else:
+             raise ValueError("Adding regular tokens is not supported")
+
+     def _add_tokens(
+         self,
+         new_tokens: Union[List[str], List[AddedToken]],
+         special_tokens: bool = False,
+     ) -> int:
+         if not special_tokens and new_tokens:
+             raise ValueError("Adding regular tokens is not supported")
+         new_tokens_str = []
+         for token in new_tokens:
+             surface_form = token.content if isinstance(token, AddedToken) else token
+             new_tokens_str.append(surface_form)
+
+         return self._update_tiktoken(new_tokens_str, special_tokens)
+
+     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (vocabulary).
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         file_path = os.path.join(save_directory, "qwen.tiktoken")
+         with open(file_path, "w", encoding="utf8") as w:
+             for k, v in self.mergeable_ranks.items():
+                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                 w.write(line)
+         return (file_path,)
+
+     def tokenize(
+         self,
+         text: str,
+         allowed_special: Union[Set, str] = "all",
+         disallowed_special: Union[Collection, str] = (),
+         **kwargs,
+     ) -> List[Union[bytes, str]]:
+         """
+         Converts a string in a sequence of tokens.
+
+         Args:
+             text (`str`):
+                 The sequence to be encoded.
+             allowed_special (`Literal["all"]` or `set`):
+                 The surface forms of the tokens to be encoded as special tokens in regular texts.
+                 Default to "all".
+             disallowed_special (`Literal["all"]` or `Collection`):
+                 The surface forms of the tokens that should not be in regular texts and trigger errors.
+                 Default to an empty tuple.
+
+             kwargs (additional keyword arguments, *optional*):
+                 Will be passed to the underlying model specific encode method.
+
+         Returns:
+             `List[bytes|str]`: The list of tokens.
+         """
+         tokens = []
+         text = unicodedata.normalize("NFC", text)
+
+         # this implementation takes a detour: text -> token id -> token surface forms
+         for t in self.tokenizer.encode(
+             text, allowed_special=allowed_special, disallowed_special=disallowed_special
+         ):
+             tokens.append(self.decoder[t])
+         return tokens
+
+     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+         """
+         Converts a sequence of tokens in a single string.
+         """
+         text = ""
+         temp = b""
+         for t in tokens:
+             if isinstance(t, str):
+                 if temp:
+                     text += temp.decode("utf-8", errors=self.errors)
+                     temp = b""
+                 text += t
+             elif isinstance(t, bytes):
+                 temp += t
+             else:
+                 raise TypeError("token should only be of type types or str")
+         if temp:
+             text += temp.decode("utf-8", errors=self.errors)
+         return text
+
+     @property
+     def vocab_size(self):
+         return self.tokenizer.n_vocab
+
+     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+         """Converts an id to a token, special tokens included"""
+         if index in self.decoder:
+             return self.decoder[index]
+         raise ValueError("unknown ids")
+
+     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+         """Converts a token to an id using the vocab, special tokens included"""
+         if token in self.special_tokens:
+             return self.special_tokens[token]
+         if token in self.mergeable_ranks:
+             return self.mergeable_ranks[token]
+         raise ValueError("unknown token")
+
+     def _tokenize(self, text: str, **kwargs):
+         """
+         Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+         Do NOT take care of added tokens.
+         """
+         raise NotImplementedError
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         errors: str = None,
+         **kwargs,
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         if skip_special_tokens:
+             token_ids = [i for i in token_ids if i < self.eod_id]
+         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
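The 288-line hunk above matches the new paddlex/inference/models/common/tokenizer/qwen_tokenizer.py listed in the file table (+288 -0): a tiktoken-backed QWen tokenizer. As a rough usage sketch (not part of the diff), a tokenizer built from this class might be exercised like the following; the vocabulary path and sample text are assumptions, and tiktoken must be installed:

# Hypothetical usage sketch for the QWenTokenizer added above (not part of the package diff).
# Assumes tiktoken is installed and "qwen.tiktoken" is a local BPE vocabulary file.
from paddlex.inference.models.common.tokenizer.qwen_tokenizer import QWenTokenizer

tokenizer = QWenTokenizer(vocab_file="qwen.tiktoken")  # path is an assumption
tokens = tokenizer.tokenize("Hello, Qwen!")            # surface forms (bytes/str)
ids = tokenizer.convert_tokens_to_ids(tokens)          # integer token ids
text = tokenizer._decode(ids, skip_special_tokens=True)  # round-trips back to the input
print(len(tokenizer), ids, text)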
@@ -239,7 +239,7 @@ def adapt_stale_fwd_patch(self, name, value):
  "might be based on an old oversion which missing some "
  f"arguments compared with the latest, such as {new_args}. "
  "We automatically add compatibility on the patch for "
- "these arguemnts, and maybe the patch should be updated."
+ "these arguments, and maybe the patch should be updated."
  )
  else:
  logging.warning(
@@ -247,7 +247,7 @@ def adapt_stale_fwd_patch(self, name, value):
  "is patched and the patch might be conflict with patches made "
  f"by paddlenlp which seems have more arguments such as {new_args}. "
  "We automatically add compatibility on the patch for "
- "these arguemnts, and maybe the patch should be updated."
+ "these arguments, and maybe the patch should be updated."
  )
  if isinstance(self, paddle.nn.Layer) and inspect.isfunction(value):

@@ -290,8 +290,8 @@ class InitTrackerMeta(type):

  def __init__(cls, name, bases, attrs):
  init_func = cls.__init__
- # If attrs has `__init__`, wrap it using accessable `_pre_init, _post_init`.
- # Otherwise, no need to wrap again since the super cls has been wraped.
+ # If attrs has `__init__`, wrap it using accessible `_pre_init, _post_init`.
+ # Otherwise, no need to wrap again since the super cls has been wrapped.
  # TODO: remove reduplicated tracker if using super cls `__init__`
  pre_init_func = getattr(cls, "_pre_init", None) if "__init__" in attrs else None
  post_init_func = (
@@ -323,12 +323,12 @@ class InitTrackerMeta(type):

  @functools.wraps(init_func)
  def __impl__(self, *args, **kwargs):
- # registed helper by `pre_init_func`
+ # registered helper by `pre_init_func`
  if pre_init_func:
  pre_init_func(self, init_func, *args, **kwargs)
  # keep full configuration
  init_func(self, *args, **kwargs)
- # registed helper by `post_init_func`
+ # registered helper by `post_init_func`
  if post_init_func:
  post_init_func(self, init_func, *args, **kwargs)
  self.init_config = kwargs
@@ -588,7 +588,7 @@ def _is_control(char):


  def _is_nonnormalized_char(char):
- """Check whther `chars` is a non-normalized character."""
+ """Check whether `chars` is a non-normalized character."""
  cp = ord(char)
  if (
  (0xFF00 <= cp <= 0xFFEF)
@@ -688,7 +688,7 @@ class ChatTemplateMixin:
  conversation = [[conversation]]
  elif isinstance(conversation, list) and isinstance(conversation[0], str):
  raise ValueError(
- "apply_chat_template do not support appling batch conversations, "
+ "apply_chat_template do not support applying batch conversations, "
  "so you should apply the conversation one by one."
  )

@@ -710,7 +710,7 @@ class ChatTemplateMixin:
  conversations = conversation
  else:
  raise ValueError(
- "apply_chat_template do not support appling batch conversations, "
+ "apply_chat_template do not support applying batch conversations, "
  "so you should apply the conversation one by one."
  )
  query = self.chat_template.render(
@@ -847,7 +847,7 @@ class ChatTemplateMixin:
  self, origin_msg: List[Dict[str, str]], split_s: List[str]
  ):
  """Split the entire chat by specified words. Extract the non-learnable parts."""
- # distingish and replace the special words in original string to an uncompiled form: Like | -> \|
+ # distinguish and replace the special words in original string to an uncompiled form: Like | -> \|
  regex_pattern = "|".join(map(re.escape, split_s))
  # splited by replaced specified words
  non_learnable_parts = re.split(
@@ -1738,7 +1738,7 @@ class PretrainedTokenizer(
  [0] * len(pair_ids) if pair else []
  )
  encoded_inputs["offset_mapping"] = offset_mapping
- # Build output dictionnary
+ # Build output dictionary
  encoded_inputs["input_ids"] = sequence
  if return_token_type_ids:
  encoded_inputs["token_type_ids"] = token_type_ids
@@ -2108,7 +2108,7 @@ def _is_whitespace(char):
  """
  Checks whether `chars` is a whitespace character.
  """
- # \t, \n, and \r are technically contorl characters but we treat them
+ # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
  return True
@@ -2136,7 +2136,7 @@ def convert_to_unicode(text):

  def whitespace_tokenize(text):
  """
- Runs basic whitespace cleaning and splitting on a peice of text.
+ Runs basic whitespace cleaning and splitting on a piece of text.
  Args:
  text (str): Text to be tokenized.
  Returns:
@@ -1634,7 +1634,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
  # From HF Hub or AI Studio
  if from_hf_hub or from_aistudio:
  # Only include the necessary resource files specified by the tokenizer cls
- # Deep copy to avoid modifiying the class attributes
+ # Deep copy to avoid modifying the class attributes
  vocab_files = copy.deepcopy(cls.resource_files_names)
  vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file

@@ -3110,7 +3110,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
  sequence = ids + pair_ids if pair else ids
  token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

- # Build output dictionnary
+ # Build output dictionary
  encoded_inputs["input_ids"] = sequence
  if return_token_type_ids:
  encoded_inputs["token_type_ids"] = token_type_ids
@@ -3531,7 +3531,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
  prefix_offset: int = 0,
  read_offset: int = 0,
  ) -> Tuple[str, int, int]:
- """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
+ """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
  prefix_text = self.decode(
  all_input_ids[prefix_offset:read_offset],
  skip_special_tokens=False,
@@ -27,8 +27,8 @@ class Vocab(object):
  store/load functions.

  Args:
- counter (collections.Counter, optional): A Counter intance describes
- the tokens and their frequencies. Its keys will be indexed accroding
+ counter (collections.Counter, optional): A Counter instance describes
+ the tokens and their frequencies. Its keys will be indexed according
  to the order of frequency sorting to construct mapping relationship.
  If None, `token_to_idx` must be provided as the mapping relationship.
  Default: None.
@@ -40,7 +40,7 @@ class Vocab(object):
  between tokens and indices to be used. If provided, adjust the tokens
  and indices mapping according to it. If None, counter must be provided.
  Default: None.
- unk_token (str, optional): Special token for unknow token. If no need,
+ unk_token (str, optional): Special token for unknown token. If no need,
  it also could be None. Default: None.
  pad_token (str, optional): Special token for padding token. If no need,
  it also could be None. Default: None.
@@ -231,7 +231,7 @@ class Vocab(object):
  for idx in indices:
  if not isinstance(idx, (int, np.integer)):
  warnings.warn(
- "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
+ "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
  )
  idx = int(idx)

@@ -422,7 +422,7 @@ class Vocab(object):
  Args:
  token_to_idx (dict): A dict describes the mapping relationship between
  tokens and indices.
- unk_token (str, optional): The special token for unknow token. If
+ unk_token (str, optional): The special token for unknown token. If
  no need, it also could be None. Default: None.
  pad_token (str, optional): The special token for padding token. If
  no need, it also could be None. Default: None.
@@ -480,7 +480,7 @@ class Vocab(object):
  **kwargs
  ):
  """
- Builds the :class:`Vocab` accoring to given iterator and other
+ Builds the :class:`Vocab` according to given iterator and other
  information. Firstly, iterate over the `iterator` to construct a
  :class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -495,7 +495,7 @@ class Vocab(object):
  relationship between tokens and indices to be used. If provided,
  adjust the tokens and indices mapping according to it. If None,
  counter must be provided. Default: None.
- unk_token (str, optional): The special token for unknow token
+ unk_token (str, optional): The special token for unknown token
  '<unk>'. If no need, it also could be None. Default: None.
  pad_token (str, optional): The special token for padding token
  '<pad>'. If no need, it also could be None. Default: None.
@@ -0,0 +1,99 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ import numpy as np
+ import paddle
+
+
+ def fuse_param_func():
+     def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None):
+         concat_fn = np.concatenate
+         split_fn = np.split
+         if isinstance(fuse_params[0], paddle.Tensor):
+             concat_fn = paddle.concat
+             split_fn = paddle.split
+
+         if is_qkv:
+             assert (
+                 num_heads
+             ), f"num_heads should be number of heads for Q, but got {num_heads}"
+             assert (
+                 num_key_value_heads
+             ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+             assert (
+                 len(fuse_params) == 3
+             ), f"fuse_params length is not equal 3, it should be Q K V list. but got length {len(fuse_params)}"
+             num_query_groups = num_heads // num_key_value_heads
+             q_list = split_fn(fuse_params[0], num_heads, axis=-1)
+             k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1)
+             v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1)
+
+             qkv_pairs = []
+             for i in range(num_key_value_heads):
+                 qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups]
+                 qkv_pairs.append(k_list[i])
+                 qkv_pairs.append(v_list[i])
+             return concat_fn(qkv_pairs, axis=-1)
+         else:
+             return concat_fn(fuse_params, axis=-1)
+
+     return fn
+
+
+ def split_param_func():
+     def fn(
+         fused_param,
+         split_nums=2,
+         is_qkv=False,
+         num_heads=None,
+         num_key_value_heads=None,
+     ):
+         concat_fn = np.concatenate
+         split_fn = np.split
+         if isinstance(fused_param, paddle.Tensor):
+             concat_fn = paddle.concat
+             split_fn = paddle.split
+
+         if is_qkv:
+             assert (
+                 num_heads
+             ), f"num_heads should be number of heads for Q, but got {num_heads}"
+             assert (
+                 num_key_value_heads
+             ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+             num_query_groups = num_heads // num_key_value_heads
+             q_list, k_list, v_list = [], [], []
+             split_heads = split_fn(
+                 fused_param, num_heads + 2 * num_key_value_heads, axis=-1
+             )
+             for i in range(num_key_value_heads):
+                 q_list += split_heads[
+                     i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2
+                 ]
+                 k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2])
+                 v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1])
+             return (
+                 concat_fn(q_list, axis=-1),
+                 concat_fn(k_list, axis=-1),
+                 concat_fn(v_list, axis=-1),
+             )
+         else:
+             return split_fn(fused_param, split_nums, axis=-1)
+
+     return fn
+
+
+ def split_or_fuse_func(is_fuse=True):
+     return fuse_param_func() if is_fuse else split_param_func()
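The 99-line hunk above matches the new paddlex/inference/models/common/vlm/conversion_utils.py listed in the file table (+99 -0): helpers that interleave per-head Q/K/V weight slices into a single fused parameter (grouped-query layout) and split it back. A minimal round-trip sketch, assuming the module path from the file list and made-up shapes (4 query heads, 2 key/value heads, head_dim 3), not part of the diff:

# Hypothetical round-trip check for the fuse/split helpers added above (not part of the package diff).
import numpy as np

from paddlex.inference.models.common.vlm.conversion_utils import (
    fuse_param_func,
    split_param_func,
)

q = np.random.rand(8, 12)  # (hidden, num_heads * head_dim)
k = np.random.rand(8, 6)   # (hidden, num_key_value_heads * head_dim)
v = np.random.rand(8, 6)

fuse = fuse_param_func()
split = split_param_func()

qkv = fuse([q, k, v], is_qkv=True, num_heads=4, num_key_value_heads=2)   # (8, 24), grouped [q, q, k, v] layout
q2, k2, v2 = split(qkv, is_qkv=True, num_heads=4, num_key_value_heads=2)
assert np.allclose(q, q2) and np.allclose(k, k2) and np.allclose(v, v2)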