paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +29 -73
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/ts/funcs.py +19 -8
  44. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  45. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  46. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  48. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  49. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  51. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  52. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  53. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  54. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  57. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  58. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  59. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  60. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  61. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  63. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  64. paddlex/inference/models/formula_recognition/predictor.py +8 -2
  65. paddlex/inference/models/formula_recognition/processors.py +90 -77
  66. paddlex/inference/models/formula_recognition/result.py +28 -27
  67. paddlex/inference/models/image_feature/processors.py +3 -4
  68. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  69. paddlex/inference/models/object_detection/predictor.py +2 -0
  70. paddlex/inference/models/object_detection/processors.py +28 -3
  71. paddlex/inference/models/object_detection/utils.py +2 -0
  72. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  73. paddlex/inference/models/text_detection/predictor.py +8 -0
  74. paddlex/inference/models/text_detection/processors.py +44 -10
  75. paddlex/inference/models/text_detection/result.py +0 -10
  76. paddlex/inference/models/text_recognition/result.py +1 -1
  77. paddlex/inference/pipelines/__init__.py +9 -5
  78. paddlex/inference/pipelines/_parallel.py +172 -0
  79. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  80. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  81. paddlex/inference/pipelines/base.py +14 -4
  82. paddlex/inference/pipelines/components/faisser.py +1 -1
  83. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  84. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  85. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  86. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  87. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  88. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  89. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  90. paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
  91. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  92. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
  93. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  94. paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
  95. paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
  96. paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
  97. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  98. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
  99. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
  100. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  101. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  102. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  103. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  104. paddlex/inference/pipelines/ocr/result.py +21 -18
  105. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  106. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  107. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  108. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  109. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
  110. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  112. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  113. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  114. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  115. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  116. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  117. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  118. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  119. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  120. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  121. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  122. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  123. paddlex/inference/serving/basic_serving/_app.py +46 -13
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  127. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  128. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  129. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  130. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  131. paddlex/inference/serving/infra/utils.py +20 -22
  132. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  133. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  134. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  135. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  136. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  137. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  138. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  139. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  140. paddlex/inference/utils/hpi.py +30 -16
  141. paddlex/inference/utils/hpi_model_info_collection.json +666 -162
  142. paddlex/inference/utils/io/readers.py +12 -12
  143. paddlex/inference/utils/misc.py +20 -0
  144. paddlex/inference/utils/mkldnn_blocklist.py +59 -0
  145. paddlex/inference/utils/official_models.py +140 -5
  146. paddlex/inference/utils/pp_option.py +74 -9
  147. paddlex/model.py +2 -2
  148. paddlex/modules/__init__.py +1 -1
  149. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  150. paddlex/modules/base/__init__.py +1 -1
  151. paddlex/modules/base/evaluator.py +5 -5
  152. paddlex/modules/base/trainer.py +1 -1
  153. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  154. paddlex/modules/doc_vlm/evaluator.py +2 -2
  155. paddlex/modules/doc_vlm/exportor.py +2 -2
  156. paddlex/modules/doc_vlm/model_list.py +1 -1
  157. paddlex/modules/doc_vlm/trainer.py +2 -2
  158. paddlex/modules/face_recognition/evaluator.py +2 -2
  159. paddlex/modules/formula_recognition/evaluator.py +5 -2
  160. paddlex/modules/formula_recognition/model_list.py +3 -0
  161. paddlex/modules/formula_recognition/trainer.py +3 -0
  162. paddlex/modules/general_recognition/evaluator.py +1 -1
  163. paddlex/modules/image_classification/evaluator.py +2 -2
  164. paddlex/modules/image_classification/model_list.py +1 -0
  165. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  166. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  167. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  168. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  169. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  170. paddlex/modules/object_detection/evaluator.py +2 -2
  171. paddlex/modules/object_detection/model_list.py +2 -0
  172. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
  173. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  174. paddlex/modules/table_recognition/evaluator.py +2 -2
  175. paddlex/modules/text_detection/evaluator.py +2 -2
  176. paddlex/modules/text_detection/model_list.py +2 -0
  177. paddlex/modules/text_recognition/evaluator.py +2 -2
  178. paddlex/modules/text_recognition/model_list.py +2 -0
  179. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  180. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  181. paddlex/modules/ts_classification/evaluator.py +2 -2
  182. paddlex/modules/ts_forecast/evaluator.py +2 -2
  183. paddlex/modules/video_classification/evaluator.py +2 -2
  184. paddlex/modules/video_detection/evaluator.py +2 -2
  185. paddlex/ops/__init__.py +8 -5
  186. paddlex/paddlex_cli.py +19 -13
  187. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  188. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  189. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  190. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  191. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  192. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  193. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  194. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  195. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  196. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  197. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  198. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  200. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  201. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  202. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  203. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  204. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  205. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  206. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  207. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  208. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  209. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  210. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  211. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  212. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  213. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  214. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  215. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  216. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  217. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  218. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  219. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  220. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  221. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  222. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  223. paddlex/repo_apis/base/config.py +1 -1
  224. paddlex/repo_manager/core.py +3 -3
  225. paddlex/repo_manager/meta.py +6 -2
  226. paddlex/repo_manager/repo.py +17 -16
  227. paddlex/utils/custom_device_list.py +26 -2
  228. paddlex/utils/deps.py +3 -3
  229. paddlex/utils/device.py +5 -13
  230. paddlex/utils/env.py +4 -0
  231. paddlex/utils/flags.py +11 -4
  232. paddlex/utils/fonts/__init__.py +34 -4
  233. paddlex/utils/misc.py +1 -1
  234. paddlex/utils/subclass_register.py +2 -2
  235. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
  236. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
  237. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
  238. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
  239. {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
  240. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/models/doc_vlm/modeling/qwen2.py (new file)
@@ -0,0 +1,1606 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from functools import partial
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+ import paddle.distributed.fleet.meta_parallel as mpu
21
+ import paddle.nn as nn
22
+ import paddle.nn.functional as F
23
+ from paddle import Tensor
24
+ from paddle.distributed import fleet
25
+ from paddle.distributed.fleet.utils import sequence_parallel_utils
26
+
27
+ from .....utils import logging
28
+ from .....utils.env import get_device_type
29
+ from ...common.vlm import fusion_ops
30
+ from ...common.vlm.activations import ACT2FN
31
+ from ...common.vlm.transformers import PretrainedConfig, PretrainedModel
32
+ from ...common.vlm.transformers.model_outputs import (
33
+ BaseModelOutputWithPast,
34
+ CausalLMOutputWithPast,
35
+ )
36
+
37
+ try:
38
+ from paddle.incubate.nn.functional import fused_rotary_position_embedding
39
+ except ImportError:
40
+ fused_rotary_position_embedding = None
41
+
42
+ try:
43
+ from paddle.distributed.fleet.utils.sequence_parallel_utils import (
44
+ GatherOp,
45
+ ScatterOp,
46
+ mark_as_sequence_parallel_parameter,
47
+ )
48
+ except:
49
+ pass
50
+
51
+ try:
52
+ from paddle.nn.functional.flash_attention import flash_attention
53
+ except:
54
+ flash_attention = None
55
+
56
+
57
+ Linear = nn.Linear
58
+ ColumnParallelLinear = mpu.ColumnParallelLinear
59
+ RowParallelLinear = mpu.RowParallelLinear
60
+ ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear
61
+ RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear
62
+
63
+
64
+ class Qwen2Config(PretrainedConfig):
65
+ r"""
66
+ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
67
+ Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
68
+ with the defaults will yield a similar configuration to that of
69
+ Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
70
+
71
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
72
+ documentation from [`PretrainedConfig`] for more information.
73
+
74
+
75
+ Args:
76
+ vocab_size (`int`, *optional*, defaults to 151936):
77
+ Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
78
+ `inputs_ids` passed when calling [`Qwen2Model`]
79
+ hidden_size (`int`, *optional*, defaults to 4096):
80
+ Dimension of the hidden representations.
81
+ intermediate_size (`int`, *optional*, defaults to 22016):
82
+ Dimension of the MLP representations.
83
+ num_hidden_layers (`int`, *optional*, defaults to 32):
84
+ Number of hidden layers in the Transformer encoder.
85
+ num_attention_heads (`int`, *optional*, defaults to 32):
86
+ Number of attention heads for each attention layer in the Transformer encoder.
87
+ num_key_value_heads (`int`, *optional*, defaults to 32):
88
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
89
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
90
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
91
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
92
+ by meanpooling all the original heads within that group. For more details checkout [this
93
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
94
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
95
+ The non-linear activation function (function or string) in the decoder.
96
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
97
+ The maximum sequence length that this model might ever be used with.
98
+ initializer_range (`float`, *optional*, defaults to 0.02):
99
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
100
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
101
+ The epsilon used by the rms normalization layers.
102
+ use_cache (`bool`, *optional*, defaults to `True`):
103
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
104
+ relevant if `config.is_decoder=True`.
105
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
106
+ Whether the model's input and output word embeddings should be tied.
107
+ rope_theta (`float`, *optional*, defaults to 10000.0):
108
+ The base period of the RoPE embeddings.
109
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
110
+ Whether to use sliding window attention.
111
+ sliding_window (`int`, *optional*, defaults to 4096):
112
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
113
+ max_window_layers (`int`, *optional*, defaults to 28):
114
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
115
+ attention_dropout (`float`, *optional*, defaults to 0.0):
116
+ The dropout ratio for the attention probabilities.
117
+ """
118
+
119
+ model_type = "qwen2"
120
+ keys_to_ignore_at_inference = ["past_key_values"]
121
+
122
+ def __init__(
123
+ self,
124
+ vocab_size=151936,
125
+ hidden_size=4096,
126
+ intermediate_size=22016,
127
+ num_hidden_layers=32,
128
+ num_attention_heads=32,
129
+ num_key_value_heads=32,
130
+ hidden_act="silu",
131
+ max_position_embeddings=32768,
132
+ seq_length=32768,
133
+ initializer_range=0.02,
134
+ rms_norm_eps=1e-6,
135
+ use_cache=True,
136
+ tie_word_embeddings=False,
137
+ rope_theta=10000.0,
138
+ pad_token_id=0,
139
+ bos_token_id=151643,
140
+ eos_token_id=151643,
141
+ use_sliding_window=False,
142
+ sliding_window=4096,
143
+ max_window_layers=28,
144
+ attention_dropout=0.0,
145
+ rope_scaling_factor=1.0,
146
+ rope_scaling_type=None,
147
+ dpo_config=None,
148
+ **kwargs,
149
+ ):
150
+ self.vocab_size = vocab_size
151
+ self.max_position_embeddings = max_position_embeddings
152
+ self.seq_length = seq_length
153
+ self.hidden_size = hidden_size
154
+ self.intermediate_size = intermediate_size
155
+ self.num_hidden_layers = num_hidden_layers
156
+ self.num_attention_heads = num_attention_heads
157
+ self.use_sliding_window = use_sliding_window
158
+ self.sliding_window = sliding_window
159
+ self.max_window_layers = max_window_layers
160
+
161
+ # for backward compatibility
162
+ if num_key_value_heads is None:
163
+ num_key_value_heads = num_attention_heads
164
+
165
+ self.num_key_value_heads = num_key_value_heads
166
+ self.hidden_act = hidden_act
167
+ self.initializer_range = initializer_range
168
+ self.rms_norm_eps = rms_norm_eps
169
+ self.use_cache = use_cache
170
+ self.rope_theta = rope_theta
171
+ self.attention_dropout = attention_dropout
172
+
173
+ self.use_cache = use_cache
174
+ self.rope_scaling_factor = rope_scaling_factor
175
+ self.rope_scaling_type = rope_scaling_type
176
+
177
+ self.pad_token_id = pad_token_id
178
+ self.bos_token_id = bos_token_id
179
+ self.eos_token_id = eos_token_id
180
+ self.dpo_config = dpo_config
181
+
182
+ super().__init__(
183
+ pad_token_id=pad_token_id,
184
+ bos_token_id=bos_token_id,
185
+ eos_token_id=eos_token_id,
186
+ tie_word_embeddings=tie_word_embeddings,
187
+ **kwargs,
188
+ )
189
+
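# --- Editor's sketch (not part of the qwen2.py diff above): instantiating Qwen2Config. ---
# The import path simply mirrors the file location listed earlier and assumes the
# 3.0.2 wheel is installed; the keyword values are illustrative, not PaddleX
# defaults. It shows how the GQA-related fields relate to each other.
from paddlex.inference.models.doc_vlm.modeling.qwen2 import Qwen2Config

cfg = Qwen2Config(
    vocab_size=151936,
    hidden_size=1536,
    intermediate_size=8960,
    num_hidden_layers=28,
    num_attention_heads=12,
    num_key_value_heads=2,   # 12 query heads share 2 KV heads -> GQA with group size 6
    tie_word_embeddings=True,
)
assert cfg.num_attention_heads % cfg.num_key_value_heads == 0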
190
+
191
+ def get_triangle_upper_mask(x, mask=None):
192
+ if mask is not None:
193
+ return mask
194
+ # [bsz, n_head, q_len, kv_seq_len]
195
+ shape = x.shape
196
+ # [bsz, 1, q_len, kv_seq_len]
197
+ shape[1] = 1
198
+ mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype)
199
+ mask = paddle.triu(mask, diagonal=1)
200
+ mask.stop_gradient = True
201
+ return mask
202
+
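# --- Editor's sketch (not part of the qwen2.py diff): what the additive mask does. ---
# get_triangle_upper_mask fills the upper triangle with the dtype's most negative
# value so that, after softmax, essentially no probability mass lands on future
# positions. Standalone paddle, shapes following the [bsz, 1, q_len, kv_seq_len]
# comment above.
import paddle
import paddle.nn.functional as F

scores = paddle.randn([1, 1, 4, 4])
mask = paddle.triu(
    paddle.full(scores.shape, paddle.finfo(scores.dtype).min, dtype=scores.dtype),
    diagonal=1,
)
probs = F.softmax(scores + mask, axis=-1)
assert float(probs[0, 0, 0, 0]) > 0.999  # position 0 can only attend to itself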
203
+
204
+ def parallel_matmul(
205
+ x: Tensor, y: Tensor, transpose_y=True, tensor_parallel_output=True
206
+ ):
207
+ is_fleet_init = True
208
+ tensor_parallel_degree = 1
209
+ try:
210
+ hcg = fleet.get_hybrid_communicate_group()
211
+ model_parallel_group = hcg.get_model_parallel_group()
212
+ tensor_parallel_degree = hcg.get_model_parallel_world_size()
213
+ except:
214
+ is_fleet_init = False
215
+
216
+ if paddle.in_dynamic_mode():
217
+ y_is_distributed = y.is_distributed
218
+ else:
219
+ y_is_distributed = tensor_parallel_degree > 1
220
+
221
+ if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed:
222
+ # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg'
223
+ input_parallel = paddle.distributed.collective._c_identity(
224
+ x, group=model_parallel_group
225
+ )
226
+ logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y)
227
+
228
+ if tensor_parallel_output:
229
+ return logits
230
+
231
+ return paddle.distributed.collective._c_concat(
232
+ logits, group=model_parallel_group
233
+ )
234
+
235
+ else:
236
+ logits = paddle.matmul(x, y, transpose_y=transpose_y)
237
+ return logits
238
+
239
+
240
+ def scaled_dot_product_attention(
241
+ query_states,
242
+ config,
243
+ key_states,
244
+ value_states,
245
+ attention_mask,
246
+ output_attentions,
247
+ attn_mask_startend_row_indices=None,
248
+ training=True,
249
+ sequence_parallel=False,
250
+ skip_recompute=False,
251
+ ):
252
+ bsz, q_len, num_heads, head_dim = query_states.shape
253
+ _, kv_seq_len, _, _ = value_states.shape
254
+
255
+ # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
256
+ query_states = paddle.transpose(query_states, [0, 2, 1, 3])
257
+ # merge with the next transpose
258
+ key_states = paddle.transpose(key_states, [0, 2, 1, 3])
259
+ value_states = paddle.transpose(value_states, [0, 2, 1, 3])
260
+
261
+ # Add pre divided factor to fix nan under float16.
262
+ if paddle.in_dynamic_mode() and query_states.dtype == paddle.float16:
263
+ pre_divided_factor = 32
264
+ else:
265
+ pre_divided_factor = 1
266
+
267
+ attn_weights = paddle.matmul(
268
+ query_states / (math.sqrt(head_dim) * pre_divided_factor),
269
+ key_states.transpose([0, 1, 3, 2]),
270
+ )
271
+
272
+ if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]:
273
+ raise ValueError(
274
+ f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is"
275
+ f" {attn_weights.shape}"
276
+ )
277
+
278
+ if attention_mask is None:
279
+ attention_mask = get_triangle_upper_mask(attn_weights)
280
+
281
+ attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len])
282
+ if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]:
283
+ raise ValueError(
284
+ f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
285
+ )
286
+
287
+ attn_weights = attn_weights + attention_mask
288
+
289
+ if not paddle.in_dynamic_mode():
290
+ attn_weights = F.softmax(
291
+ attn_weights * pre_divided_factor, axis=-1, dtype="float32"
292
+ ).astype(query_states.dtype)
293
+ else:
294
+ with paddle.amp.auto_cast(False):
295
+ attn_weights = F.softmax(
296
+ attn_weights.astype("float32") * pre_divided_factor,
297
+ axis=-1,
298
+ dtype="float32",
299
+ ).astype(query_states.dtype)
300
+
301
+ attn_weights = F.dropout(
302
+ attn_weights, p=config.attention_dropout, training=training
303
+ )
304
+
305
+ attn_output = paddle.matmul(attn_weights, value_states)
306
+ attn_output = attn_output.transpose([0, 2, 1, 3])
307
+
308
+ if sequence_parallel:
309
+ attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads])
310
+ else:
311
+ attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads])
312
+ return (attn_output, attn_weights) if output_attentions else attn_output
313
+
314
+
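# --- Editor's sketch (not part of the qwen2.py diff): the pre-divided factor. ---
# Under float16 the raw QK^T logits can overflow, so the code above divides the
# query by an extra factor of 32 and multiplies it back inside the softmax. The
# two formulations are mathematically identical, as this float32 check shows.
import math
import paddle
import paddle.nn.functional as F

bsz, heads, q_len, head_dim = 1, 2, 5, 64
q = paddle.randn([bsz, heads, q_len, head_dim])
k = paddle.randn([bsz, heads, q_len, head_dim])

pre = 32
scores_plain = paddle.matmul(q / math.sqrt(head_dim), k, transpose_y=True)
scores_pre = paddle.matmul(q / (math.sqrt(head_dim) * pre), k, transpose_y=True)
assert paddle.allclose(
    F.softmax(scores_plain, axis=-1), F.softmax(scores_pre * pre, axis=-1), atol=1e-5
)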
315
+ def is_casual_mask(attention_mask):
316
+ """
317
+ Whether the upper triangular part of attention_mask equals attention_mask, i.e. whether the mask is causal.
318
+ """
319
+ return (paddle.triu(attention_mask) == attention_mask).all().item()
320
+
321
+
322
+ def _make_causal_mask(input_ids_shape, past_key_values_length):
323
+ """
324
+ Make causal mask used for self-attention
325
+ """
326
+ batch_size, target_length = input_ids_shape # target_length: seq_len
327
+
328
+ mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool"))
329
+
330
+ if past_key_values_length > 0:
331
+ # [tgt_len, tgt_len + past_len]
332
+ mask = paddle.concat(
333
+ [paddle.ones([target_length, past_key_values_length], dtype="bool"), mask],
334
+ axis=-1,
335
+ )
336
+
337
+ # [bs, 1, tgt_len, tgt_len + past_len]
338
+ return mask[None, None, :, :].expand(
339
+ [batch_size, 1, target_length, target_length + past_key_values_length]
340
+ )
341
+
342
+
343
+ def _expand_2d_mask(mask, dtype, tgt_length):
344
+ """
345
+ Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
346
+ """
347
+ batch_size, src_length = mask.shape[0], mask.shape[-1]
348
+ tgt_length = tgt_length if tgt_length is not None else src_length
349
+
350
+ mask = mask[:, None, None, :].astype("bool")
351
+ mask.stop_gradient = True
352
+ expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length])
353
+
354
+ return expanded_mask
355
+
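# --- Editor's sketch (not part of the qwen2.py diff): combining the two masks. ---
# This mirrors how _make_causal_mask and _expand_2d_mask are ANDed together in
# Qwen2Model._prepare_decoder_attention_mask further down; the sizes are
# illustrative. True means "may attend"; padding columns stay False everywhere.
import paddle

batch, tgt_len, past_len = 1, 4, 2
padding_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1]], dtype="int64")  # first key is padding
expanded = padding_mask[:, None, None, :].astype("bool").expand(
    [batch, 1, tgt_len, past_len + tgt_len]
)
causal = paddle.concat(
    [
        paddle.ones([tgt_len, past_len], dtype="bool"),
        paddle.tril(paddle.ones([tgt_len, tgt_len], dtype="bool")),
    ],
    axis=-1,
)[None, None, :, :].expand([batch, 1, tgt_len, past_len + tgt_len])
combined = expanded & causal
assert combined.shape == [1, 1, 4, 6]
assert not bool(combined[0, 0, 0, 0])  # the padding position is always masked out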
356
+
357
+ class Qwen2RMSNorm(nn.Layer):
358
+ def __init__(self, config: Qwen2Config):
359
+ """
360
+ Qwen2RMSNorm is equivalent to T5LayerNorm
361
+ """
362
+ super().__init__()
363
+ self.hidden_size = config.hidden_size
364
+ self.weight = paddle.create_parameter(
365
+ shape=[self.hidden_size],
366
+ dtype=paddle.get_default_dtype(),
367
+ default_initializer=nn.initializer.Constant(1.0),
368
+ )
369
+ self.variance_epsilon = config.rms_norm_eps
370
+ self.config = config
371
+
372
+ if config.sequence_parallel:
373
+ mark_as_sequence_parallel_parameter(self.weight)
374
+
375
+ def forward(self, hidden_states):
376
+ if self.config.use_fused_rms_norm:
377
+ return fusion_ops.fusion_rms_norm(
378
+ hidden_states, self.weight, self.variance_epsilon, False
379
+ )
380
+
381
+ if paddle.in_dynamic_mode():
382
+ with paddle.amp.auto_cast(False):
383
+ variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
384
+ hidden_states = (
385
+ paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
386
+ )
387
+ else:
388
+ variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
389
+ hidden_states = (
390
+ paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
391
+ )
392
+
393
+ if self.weight.dtype in [paddle.float16, paddle.bfloat16]:
394
+ hidden_states = paddle.cast(hidden_states, self.weight.dtype)
395
+ return hidden_states * self.weight
396
+
397
+
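# --- Editor's sketch (not part of the qwen2.py diff): the RMSNorm arithmetic. ---
# Unlike LayerNorm there is no mean-centering: each vector is scaled by the
# reciprocal of its root mean square, then multiplied by a learned weight. A
# standalone float32 check of the unfused branch above:
import paddle

eps, hidden = 1e-6, 8
weight = paddle.ones([hidden])
x = paddle.randn([2, 3, hidden])
variance = x.astype("float32").pow(2).mean(-1, keepdim=True)
y = paddle.rsqrt(variance + eps) * x * weight
assert paddle.allclose(y.pow(2).mean(-1), paddle.ones([2, 3]), atol=1e-3)  # ~unit RMS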
398
+ class Qwen2RotaryEmbedding(nn.Layer):
399
+ def __init__(self, dim, max_position_embeddings=2048, base=10000):
400
+ super().__init__()
401
+ self.dim = dim
402
+ self.max_position_embeddings = max_position_embeddings
403
+ self.base = base
404
+ # [dim / 2]
405
+ self.inv_freq = 1.0 / (
406
+ self.base
407
+ ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)
408
+ )
409
+ self._set_cos_sin_cache(seq_len=max_position_embeddings)
410
+
411
+ def _set_cos_sin_cache(self, seq_len):
412
+ self.max_seq_len_cached = seq_len
413
+ # [seq_len]
414
+ t = paddle.arange(seq_len, dtype="float32")
415
+ # [seq_len, dim/2]
416
+ freqs = paddle.einsum("i,j->ij", t, self.inv_freq)
417
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
418
+ # [seq_len, dim]
419
+ emb = paddle.concat([freqs, freqs], axis=-1)
420
+ # [1, seqlen, 1, dim]
421
+ self.cos_cached = emb.cos()[None, :, None, :]
422
+ self.sin_cached = emb.sin()[None, :, None, :]
423
+
424
+ def forward(self, x, seq_len=None):
425
+ # x: [bs, num_attention_heads, seq_len, head_size]
426
+ if seq_len > self.max_seq_len_cached:
427
+ self._set_cos_sin_cache(seq_len)
428
+ cos = self.cos_cached[:, :seq_len, :, :]
429
+ sin = self.sin_cached[:, :seq_len, :, :]
430
+ return (
431
+ cos.cast(x.dtype) if cos.dtype != x.dtype else cos,
432
+ sin.cast(x.dtype) if sin.dtype != x.dtype else sin,
433
+ )
434
+
435
+
436
+ def rotate_half(x):
437
+ """Rotates half the hidden dims of the input."""
438
+ x1 = x[..., : x.shape[-1] // 2]
439
+ x2 = x[..., x.shape[-1] // 2 :]
440
+ return paddle.concat([-x2, x1], axis=-1) # shape is the same as x
441
+
442
+
443
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
444
+ if position_ids is None:
445
+ # Note: Only for Qwen2MoEForCausalLMPipe model pretraining
446
+ cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim]
447
+ sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim]
448
+ else:
449
+ cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim]
450
+ sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim]
451
+ cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
452
+ sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
453
+ q_embed = (q * cos) + (rotate_half(q) * sin)
454
+ k_embed = (k * cos) + (rotate_half(k) * sin)
455
+ return q_embed, k_embed
456
+
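# --- Editor's sketch (not part of the qwen2.py diff): applying rotary embeddings. ---
# Rebuilds the cos/sin cache the same way Qwen2RotaryEmbedding does (base 10000,
# duplicated frequencies) and applies the q*cos + rotate_half(q)*sin formula.
# Each (x_i, x_{i+dim/2}) pair undergoes a 2-D rotation, so token norms are preserved.
import paddle

def _rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return paddle.concat([-x2, x1], axis=-1)

dim, seq_len = 64, 16
inv_freq = 1.0 / (10000.0 ** (paddle.arange(0, dim, 2, dtype="float32") / dim))
freqs = paddle.arange(seq_len, dtype="float32").unsqueeze(1) * inv_freq.unsqueeze(0)
emb = paddle.concat([freqs, freqs], axis=-1)                      # [seq_len, dim]
cos, sin = emb.cos()[None, :, None, :], emb.sin()[None, :, None, :]

q = paddle.randn([1, seq_len, 2, dim])                            # [bs, seq_len, heads, head_dim]
q_rot = q * cos + _rotate_half(q) * sin
assert paddle.allclose(q.square().sum(-1), q_rot.square().sum(-1), atol=1e-3)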
457
+
458
+ class Qwen2MLP(nn.Layer):
459
+ def __init__(self, config: Qwen2Config, is_shared=False, skip_recompute_ops=None):
460
+ super().__init__()
461
+ if skip_recompute_ops is None:
462
+ skip_recompute_ops = {}
463
+ self.skip_recompute_ops = skip_recompute_ops
464
+ self.hidden_size = config.hidden_size
465
+ self.intermediate_size = config.intermediate_size
466
+ self.fuse_attention_ffn = config.fuse_attention_ffn
467
+
468
+ self.tensor_parallel_degree = config.tensor_parallel_degree
469
+
470
+ if config.sequence_parallel:
471
+ ColumnParallelLinear = ColumnSequenceParallelLinear
472
+ RowParallelLinear = RowSequenceParallelLinear
473
+
474
+ if config.tensor_parallel_degree > 1:
475
+ if self.fuse_attention_ffn:
476
+ self.gate_up_fused_proj = ColumnParallelLinear(
477
+ self.hidden_size,
478
+ self.intermediate_size * 2,
479
+ gather_output=False,
480
+ has_bias=False,
481
+ )
482
+ else:
483
+ self.gate_proj = ColumnParallelLinear(
484
+ self.hidden_size,
485
+ self.intermediate_size,
486
+ gather_output=False,
487
+ has_bias=False,
488
+ )
489
+ self.up_proj = ColumnParallelLinear(
490
+ self.hidden_size,
491
+ self.intermediate_size,
492
+ gather_output=False,
493
+ has_bias=False,
494
+ )
495
+ self.down_proj = RowParallelLinear(
496
+ self.intermediate_size,
497
+ self.hidden_size,
498
+ input_is_parallel=True,
499
+ has_bias=False,
500
+ )
501
+ else:
502
+ if self.fuse_attention_ffn:
503
+ self.gate_up_fused_proj = Linear(
504
+ self.hidden_size, self.intermediate_size * 2, bias_attr=False
505
+ )
506
+ else:
507
+ self.gate_proj = Linear(
508
+ self.hidden_size, self.intermediate_size, bias_attr=False
509
+ ) # w1
510
+ self.up_proj = Linear(
511
+ self.hidden_size, self.intermediate_size, bias_attr=False
512
+ ) # w3
513
+ self.down_proj = Linear(
514
+ self.intermediate_size, self.hidden_size, bias_attr=False
515
+ ) # w2
516
+
517
+ if config.hidden_act == "silu":
518
+ self.act_fn = fusion_ops.swiglu
519
+ self.fuse_swiglu = True
520
+ else:
521
+ self.act_fn = ACT2FN[config.hidden_act]
522
+ self.fuse_swiglu = False
523
+
524
+ def forward(self, x):
525
+ if self.fuse_attention_ffn:
526
+ x = self.gate_up_fused_proj(x)
527
+ if self.fuse_swiglu:
528
+ y = None
529
+ else:
530
+ x, y = x.chunk(2, axis=-1)
531
+ else:
532
+ x, y = self.gate_proj(x), self.up_proj(x)
533
+
534
+ if self.fuse_swiglu:
535
+ x = self.act_fn(x, y)
536
+ else:
537
+ x = self.act_fn(x) * y
538
+
539
+ return self.down_proj(x)
540
+
541
+
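# --- Editor's sketch (not part of the qwen2.py diff): the SwiGLU feed-forward. ---
# In the unfused single-card path the MLP computes down(silu(gate(x)) * up(x));
# the fused branch delegates the same silu-gate product to fusion_ops.swiglu.
# Plain nn.Linear layers with illustrative sizes reproduce the arithmetic:
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

hidden, inter = 64, 172
gate_proj = nn.Linear(hidden, inter, bias_attr=False)
up_proj = nn.Linear(hidden, inter, bias_attr=False)
down_proj = nn.Linear(inter, hidden, bias_attr=False)

x = paddle.randn([2, 5, hidden])
y = down_proj(F.silu(gate_proj(x)) * up_proj(x))
assert y.shape == [2, 5, hidden]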
542
+ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor:
543
+ """
544
+ This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch,
545
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
546
+ """
547
+ batch, slen, num_key_value_heads, head_dim = hidden_states.shape
548
+ if n_rep == 1:
549
+ return hidden_states
550
+
551
+ hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1])
552
+ return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim])
553
+
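# --- Editor's sketch (not part of the qwen2.py diff): repeat_kv shape expansion. ---
# For grouped-query attention the KV heads are tiled so every query head in a
# group sees an identical key/value head. Illustrative sizes, standalone paddle:
import paddle

batch, slen, kv_heads, head_dim, n_rep = 2, 7, 2, 64, 6
kv = paddle.randn([batch, slen, kv_heads, head_dim])
expanded = (
    kv.unsqueeze(-2)
    .tile([1, 1, 1, n_rep, 1])
    .reshape([batch, slen, kv_heads * n_rep, head_dim])
)
assert expanded.shape == [2, 7, 12, 64]
assert paddle.allclose(expanded[:, :, 0], expanded[:, :, n_rep - 1])  # same KV head repeated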
554
+
555
+ class Qwen2Attention(nn.Layer):
556
+ """
557
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
558
+ and "Generating Long Sequences with Sparse Transformers".
559
+ """
560
+
561
+ def __init__(
562
+ self,
563
+ config: Qwen2Config,
564
+ layerwise_recompute: bool = True,
565
+ skip_recompute_ops=None,
566
+ ):
567
+ super().__init__()
568
+ if skip_recompute_ops is None:
569
+ skip_recompute_ops = {}
570
+ self.config = config
571
+ self.skip_recompute_ops = skip_recompute_ops
572
+ self.hidden_size = config.hidden_size
573
+ self.num_heads = config.num_attention_heads
574
+
575
+ self.head_dim = self.hidden_size // config.num_attention_heads
576
+
577
+ self.num_key_value_heads = config.num_key_value_heads
578
+ assert config.num_attention_heads // config.num_key_value_heads
579
+ self.num_key_value_groups = (
580
+ config.num_attention_heads // config.num_key_value_heads
581
+ )
582
+ self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads
583
+ self.max_position_embeddings = config.max_position_embeddings
584
+ self.rope_theta = config.rope_theta
585
+ self.is_causal = True
586
+ self.attention_dropout = config.attention_dropout
587
+
588
+ self.seq_length = config.seq_length
589
+ self.sequence_parallel = config.sequence_parallel
590
+
591
+ self.fuse_attention_qkv = config.fuse_attention_qkv
592
+
593
+ # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True
594
+ # Enable_recompute defaults to False and is controlled by Trainer
595
+ self.enable_recompute = False
596
+ self.layerwise_recompute = layerwise_recompute
597
+ self.recompute_granularity = config.recompute_granularity
598
+ if config.tensor_parallel_degree > 1:
599
+ assert (
600
+ self.num_heads % config.tensor_parallel_degree == 0
601
+ ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
602
+ self.num_heads = self.num_heads // config.tensor_parallel_degree
603
+
604
+ assert (
605
+ self.num_key_value_heads % config.tensor_parallel_degree == 0
606
+ ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
607
+ self.num_key_value_heads = (
608
+ self.num_key_value_heads // config.tensor_parallel_degree
609
+ )
610
+
611
+ self.use_fused_rope = config.use_fused_rope
612
+ if self.use_fused_rope:
613
+ if (
614
+ get_device_type() not in ["gpu", "xpu"]
615
+ or fused_rotary_position_embedding is None
616
+ ):
617
+ logging.warning(
618
+ "Enable fuse rope in the config, but fuse rope is not available. "
619
+ "Will disable fuse rope. Try using latest gpu version of Paddle."
620
+ )
621
+ self.use_fused_rope = False
622
+
623
+ if config.sequence_parallel:
624
+ ColumnParallelLinear = ColumnSequenceParallelLinear
625
+ RowParallelLinear = RowSequenceParallelLinear
626
+
627
+ if config.tensor_parallel_degree > 1:
628
+ if self.fuse_attention_qkv:
629
+ self.qkv_proj = ColumnParallelLinear(
630
+ self.hidden_size,
631
+ self.hidden_size
632
+ + 2 * self.config.num_key_value_heads * self.head_dim,
633
+ has_bias=True,
634
+ gather_output=False,
635
+ )
636
+ else:
637
+ self.q_proj = ColumnParallelLinear(
638
+ self.hidden_size,
639
+ self.hidden_size,
640
+ has_bias=True,
641
+ gather_output=False,
642
+ )
643
+ self.k_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False) # fmt:skip
644
+ self.v_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False) # fmt:skip
645
+ self.o_proj = RowParallelLinear(
646
+ self.hidden_size,
647
+ self.hidden_size,
648
+ has_bias=False,
649
+ input_is_parallel=True,
650
+ )
651
+ else:
652
+ if self.fuse_attention_qkv:
653
+ self.qkv_proj = Linear(
654
+ self.hidden_size,
655
+ self.hidden_size
656
+ + 2 * self.config.num_key_value_heads * self.head_dim,
657
+ )
658
+ else:
659
+ self.q_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=True)
660
+ self.k_proj = Linear(
661
+ self.hidden_size,
662
+ self.config.num_key_value_heads * self.head_dim,
663
+ bias_attr=True,
664
+ )
665
+ self.v_proj = Linear(
666
+ self.hidden_size,
667
+ self.config.num_key_value_heads * self.head_dim,
668
+ bias_attr=True,
669
+ )
670
+ self.o_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=False)
671
+
672
+ self.rotary_emb = Qwen2RotaryEmbedding(
673
+ self.head_dim,
674
+ max_position_embeddings=self.max_position_embeddings,
675
+ base=self.rope_theta,
676
+ )
677
+
678
+ self.attn_func = scaled_dot_product_attention
679
+
680
+ def forward(
681
+ self,
682
+ hidden_states,
683
+ position_ids: Optional[Tuple[paddle.Tensor]] = None,
684
+ past_key_value: Optional[Tuple[paddle.Tensor]] = None,
685
+ attention_mask: Optional[paddle.Tensor] = None,
686
+ output_attentions: bool = False,
687
+ use_cache: bool = False,
688
+ attn_mask_startend_row_indices: Optional[paddle.Tensor] = None,
689
+ **kwargs,
690
+ ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
691
+ """Input shape: Batch x Time x Channel"""
692
+ # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism)
693
+
694
+ if self.fuse_attention_qkv:
695
+ mix_layer = self.qkv_proj(hidden_states)
696
+ if self.sequence_parallel:
697
+ target_shape = [
698
+ -1,
699
+ self.seq_length,
700
+ self.num_key_value_heads,
701
+ (self.num_key_value_groups + 2) * self.head_dim,
702
+ ]
703
+ else:
704
+ target_shape = [
705
+ 0,
706
+ 0,
707
+ self.num_key_value_heads,
708
+ (self.num_key_value_groups + 2) * self.head_dim,
709
+ ]
710
+ mix_layer = paddle.reshape_(mix_layer, target_shape)
711
+ query_states, key_states, value_states = paddle.split(
712
+ mix_layer,
713
+ num_or_sections=[
714
+ self.num_key_value_groups * self.head_dim,
715
+ self.head_dim,
716
+ self.head_dim,
717
+ ],
718
+ axis=-1,
719
+ )
720
+ if self.gqa_or_mqa:
721
+ query_states = paddle.reshape_(
722
+ query_states, [0, 0, self.num_heads, self.head_dim]
723
+ )
724
+ else:
725
+ query_states = self.q_proj(hidden_states)
726
+ key_states = self.k_proj(hidden_states)
727
+ value_states = self.v_proj(hidden_states)
728
+
729
+ if self.sequence_parallel:
730
+ target_query_shape = [
731
+ -1,
732
+ self.seq_length,
733
+ self.num_heads,
734
+ self.head_dim,
735
+ ]
736
+ target_key_value_shape = [
737
+ -1,
738
+ self.seq_length,
739
+ self.num_key_value_heads,
740
+ self.head_dim,
741
+ ]
742
+ else:
743
+ target_query_shape = [0, 0, self.num_heads, self.head_dim]
744
+ target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
745
+ query_states = query_states.reshape(shape=target_query_shape)
746
+ key_states = key_states.reshape(shape=target_key_value_shape)
747
+ value_states = value_states.reshape(shape=target_key_value_shape)
748
+
749
+ kv_seq_len = key_states.shape[-3]
750
+ if past_key_value is not None:
751
+ kv_seq_len += past_key_value[0].shape[-3]
752
+ if self.use_fused_rope:
753
+ assert past_key_value is None, "fuse rotary not support cache kv for now"
754
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
755
+ query_states, key_states, _ = fused_rotary_position_embedding(
756
+ query_states,
757
+ key_states,
758
+ v=None,
759
+ sin=sin,
760
+ cos=cos,
761
+ position_ids=position_ids,
762
+ use_neox_rotary_style=False,
763
+ )
764
+ else:
765
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
766
+ query_states, key_states = apply_rotary_pos_emb(
767
+ query_states, key_states, cos, sin, position_ids
768
+ )
769
+
770
+ # [bs, seq_len, num_head, head_dim]
771
+ if past_key_value is not None:
772
+ key_states = paddle.concat([past_key_value[0], key_states], axis=1)
773
+ value_states = paddle.concat([past_key_value[1], value_states], axis=1)
774
+ past_key_value = (key_states, value_states) if use_cache else None
775
+
776
+ # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1
777
+ # repeat k/v heads if n_kv_heads < n_heads
778
+ paddle_version = float(paddle.__version__[:3])
779
+ if not self.config.use_flash_attention or (
780
+ (paddle_version != 0.0) and (paddle_version <= 2.6)
781
+ ):
782
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
783
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
784
+
785
+ outputs = self.attn_func(
786
+ query_states,
787
+ self.config,
788
+ key_states,
789
+ value_states,
790
+ attention_mask,
791
+ output_attentions,
792
+ attn_mask_startend_row_indices=attn_mask_startend_row_indices,
793
+ training=self.training,
794
+ sequence_parallel=self.sequence_parallel,
795
+ )
796
+ if output_attentions:
797
+ attn_output, attn_weights = outputs
798
+ else:
799
+ attn_output = outputs
800
+
801
+ # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim]
802
+ # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism.
803
+ attn_output = self.o_proj(attn_output)
804
+
805
+ if not output_attentions:
806
+ attn_weights = None
807
+
808
+ outputs = (attn_output,)
809
+
810
+ if output_attentions:
811
+ outputs += (attn_weights,)
812
+
813
+ if use_cache:
814
+ outputs += (past_key_value,)
815
+
816
+ if type(outputs) is tuple and len(outputs) == 1:
817
+ outputs = outputs[0]
818
+
819
+ return outputs
820
+
821
+
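# --- Editor's sketch (not part of the qwen2.py diff): splitting a fused QKV projection. ---
# With fuse_attention_qkv the single projection emits hidden_size + 2 * kv_heads *
# head_dim features, which are reshaped per KV head and split into query groups,
# key and value, as in Qwen2Attention.forward above. Illustrative sizes only:
import paddle

bs, seq, num_heads, kv_heads, head_dim = 1, 4, 12, 2, 128
groups = num_heads // kv_heads
hidden = num_heads * head_dim
mix = paddle.randn([bs, seq, hidden + 2 * kv_heads * head_dim])   # fused qkv_proj output
mix = mix.reshape([bs, seq, kv_heads, (groups + 2) * head_dim])
q, k, v = paddle.split(
    mix, num_or_sections=[groups * head_dim, head_dim, head_dim], axis=-1
)
q = q.reshape([bs, seq, num_heads, head_dim])
assert q.shape == [1, 4, 12, 128]
assert k.shape == [1, 4, 2, 128] and v.shape == [1, 4, 2, 128]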
822
+ class Qwen2DecoderLayer(nn.Layer):
823
+ def __init__(
824
+ self,
825
+ config: Qwen2Config,
826
+ layerwise_recompute: bool = False,
827
+ skip_recompute_ops=None,
828
+ ):
829
+ super().__init__()
830
+ if skip_recompute_ops is None:
831
+ skip_recompute_ops = {}
832
+ self.config = config
833
+ self.skip_recompute_ops = skip_recompute_ops
834
+ self.hidden_size = config.hidden_size
835
+ self.self_attn = Qwen2Attention(
836
+ config, layerwise_recompute, skip_recompute_ops=skip_recompute_ops
837
+ )
838
+
839
+ self.mlp = Qwen2MLP(config, skip_recompute_ops=skip_recompute_ops)
840
+ self.input_layernorm = Qwen2RMSNorm(config)
841
+ self.post_attention_layernorm = Qwen2RMSNorm(config)
842
+
843
+ # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True
844
+ # Enable_recompute defaults to False and is controlled by Trainer
845
+ self.enable_recompute = False
846
+ self.layerwise_recompute = layerwise_recompute
847
+ self.recompute_granularity = config.recompute_granularity
848
+
849
+ def forward(
850
+ self,
851
+ hidden_states: paddle.Tensor,
852
+ position_ids: Optional[paddle.Tensor] = None,
853
+ attention_mask: Optional[paddle.Tensor] = None,
854
+ output_attentions: Optional[bool] = False,
855
+ past_key_value: Optional[Tuple[paddle.Tensor]] = None,
856
+ use_cache: Optional[bool] = False,
857
+ attn_mask_startend_row_indices: Optional[paddle.Tensor] = None,
858
+ **kwargs,
859
+ ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
860
+ """
861
+ Args:
862
+ hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
863
+ attention_mask (`paddle.Tensor`, *optional*): attention mask of size
864
+ `(batch, sequence_length)` where padding elements are indicated by 0.
865
+ output_attentions (`bool`, *optional*):
866
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
867
+ returned tensors for more detail.
868
+ use_cache (`bool`, *optional*):
869
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
870
+ (see `past_key_values`).
871
+ past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states
872
+ """
873
+
874
+ # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel)
875
+ residual = hidden_states
876
+
877
+ hidden_states = self.input_layernorm(hidden_states)
878
+
879
+ # Self Attention
880
+ outputs = self.self_attn(
881
+ hidden_states,
882
+ position_ids,
883
+ past_key_value,
884
+ attention_mask,
885
+ output_attentions,
886
+ use_cache,
887
+ attn_mask_startend_row_indices=attn_mask_startend_row_indices,
888
+ )
889
+
890
+ if type(outputs) is tuple:
891
+ hidden_states = outputs[0]
892
+ else:
893
+ hidden_states = outputs
894
+
895
+ if output_attentions:
896
+ self_attn_weights = outputs[1]
897
+
898
+ if use_cache:
899
+ present_key_value = outputs[2 if output_attentions else 1]
900
+
901
+ hidden_states = residual + hidden_states
902
+
903
+ # Fully Connected
904
+ residual = hidden_states
905
+ hidden_states = self.post_attention_layernorm(hidden_states)
906
+ hidden_states = self.mlp(hidden_states)
907
+
908
+ hidden_states = residual + hidden_states
909
+
910
+ outputs = (hidden_states,)
911
+
912
+ if output_attentions:
913
+ outputs += (self_attn_weights,)
914
+
915
+ if use_cache:
916
+ outputs += (present_key_value,)
917
+
918
+ if type(outputs) is tuple and len(outputs) == 1:
919
+ outputs = outputs[0]
920
+
921
+ return outputs
922
+
923
+
924
+ class Qwen2PretrainedModel(PretrainedModel):
925
+ config_class = Qwen2Config
926
+ base_model_prefix = "qwen2"
927
+ _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"]
928
+
929
+ @classmethod
930
+ def _get_fuse_or_split_param_mappings(cls, config: Qwen2Config, is_fuse=False):
931
+ # return parameter fuse utils
932
+ from ...common.vlm.conversion_utils import split_or_fuse_func
933
+
934
+ fn = split_or_fuse_func(is_fuse=is_fuse)
935
+
936
+ # last key is fused key, other keys are to be fused.
937
+ fuse_qkv_keys = [
938
+ (
939
+ "layers.0.self_attn.q_proj.weight",
940
+ "layers.0.self_attn.k_proj.weight",
941
+ "layers.0.self_attn.v_proj.weight",
942
+ "layers.0.self_attn.qkv_proj.weight",
943
+ ),
944
+ (
945
+ "layers.0.self_attn.q_proj.bias",
946
+ "layers.0.self_attn.k_proj.bias",
947
+ "layers.0.self_attn.v_proj.bias",
948
+ "layers.0.self_attn.qkv_proj.bias",
949
+ ),
950
+ ]
951
+
952
+ fuse_gate_up_keys = (
953
+ "layers.0.mlp.gate_proj.weight",
954
+ "layers.0.mlp.up_proj.weight",
955
+ "layers.0.mlp.gate_up_fused_proj.weight",
956
+ )
957
+ num_heads = config.num_attention_heads
958
+ num_key_value_heads = getattr(config, "num_key_value_heads", num_heads)
959
+ fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False)
960
+ fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False)
961
+
962
+ final_actions = {}
963
+ if is_fuse:
964
+ if fuse_attention_qkv:
965
+ for i in range(config.num_hidden_layers):
966
+ for fuse_keys in fuse_qkv_keys:
967
+ keys = tuple(
968
+ [
969
+ key.replace("layers.0.", f"layers.{i}.")
970
+ for key in fuse_keys
971
+ ]
972
+ )
973
+ final_actions[keys] = partial(
974
+ fn,
975
+ is_qkv=True,
976
+ num_heads=num_heads,
977
+ num_key_value_heads=num_key_value_heads,
978
+ )
979
+ if fuse_attention_ffn:
980
+ for i in range(config.num_hidden_layers):
981
+ keys = tuple(
982
+ [
983
+ key.replace("layers.0.", f"layers.{i}.")
984
+ for key in fuse_gate_up_keys
985
+ ]
986
+ )
987
+ final_actions[keys] = fn
988
+ else:
989
+ if not fuse_attention_qkv:
990
+ for i in range(config.num_hidden_layers):
991
+ for fuse_keys in fuse_qkv_keys:
992
+ keys = tuple(
993
+ [
994
+ key.replace("layers.0.", f"layers.{i}.")
995
+ for key in fuse_keys
996
+ ]
997
+ )
998
+ final_actions[keys] = partial(
999
+ fn,
1000
+ split_nums=3,
1001
+ is_qkv=True,
1002
+ num_heads=num_heads,
1003
+ num_key_value_heads=num_key_value_heads,
1004
+ )
1005
+ if not fuse_attention_ffn:
1006
+ for i in range(config.num_hidden_layers):
1007
+ keys = tuple(
1008
+ [
1009
+ key.replace("layers.0.", f"layers.{i}.")
1010
+ for key in fuse_gate_up_keys
1011
+ ]
1012
+ )
1013
+ final_actions[keys] = partial(fn, split_nums=2)
1014
+ return final_actions
1015
+
1016
+
1017
+ class Qwen2Model(Qwen2PretrainedModel):
1018
+ """
1019
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
1020
+
1021
+ Args:
1022
+ config: Qwen2Config
1023
+ """
1024
+
1025
+ def __init__(self, config: Qwen2Config):
1026
+ super().__init__(config)
1027
+ self.padding_idx = config.pad_token_id
1028
+ self.vocab_size = config.vocab_size
1029
+
1030
+ self.hidden_size = config.hidden_size
1031
+ self.sequence_parallel = config.sequence_parallel
1032
+ self.recompute_granularity = config.recompute_granularity
1033
+ self.no_recompute_layers = (
1034
+ config.no_recompute_layers if config.no_recompute_layers is not None else []
1035
+ )
1036
+
1037
+ # Recompute defaults to False and is controlled by Trainer
1038
+ self.enable_recompute = False
1039
+ if (
1040
+ config.tensor_parallel_degree > 1
1041
+ and config.vocab_size % config.tensor_parallel_degree == 0
1042
+ ):
1043
+ self.embed_tokens = mpu.VocabParallelEmbedding(
1044
+ self.vocab_size,
1045
+ self.hidden_size,
1046
+ weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()),
1047
+ )
1048
+ else:
1049
+ self.embed_tokens = nn.Embedding(
1050
+ self.vocab_size,
1051
+ self.hidden_size,
1052
+ )
1053
+
1054
+ self.layers = nn.LayerList(
1055
+ [
1056
+ Qwen2DecoderLayer(
1057
+ config=config,
1058
+ layerwise_recompute=layer_idx not in self.no_recompute_layers,
1059
+ )
1060
+ for layer_idx in range(config.num_hidden_layers)
1061
+ ]
1062
+ )
1063
+ self.norm = Qwen2RMSNorm(config)
1064
+
1065
+ def get_input_embeddings(self):
1066
+ return self.embed_tokens
1067
+
1068
+ def set_input_embeddings(self, value):
1069
+ self.embed_tokens = value
1070
+
1071
+ @staticmethod
1072
+ def _prepare_decoder_attention_mask(
1073
+ attention_mask, input_shape, past_key_values_length, dtype
1074
+ ):
1075
+ if attention_mask is not None:
1076
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1077
+ if len(attention_mask.shape) == 2:
1078
+ expanded_attn_mask = _expand_2d_mask(
1079
+ attention_mask, dtype, tgt_length=input_shape[-1]
1080
+ )
1081
+ # For decoding phase in generation, seq_length = 1, we don't need to add causal mask
1082
+ if input_shape[-1] > 1:
1083
+ combined_attention_mask = _make_causal_mask(
1084
+ input_shape,
1085
+ past_key_values_length=past_key_values_length,
1086
+ )
1087
+ expanded_attn_mask = expanded_attn_mask & combined_attention_mask
1088
+ # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
1089
+ elif len(attention_mask.shape) == 3:
1090
+ expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
1091
+ # if attention_mask is already 4-D, do nothing
1092
+ else:
1093
+ expanded_attn_mask = attention_mask
1094
+ else:
1095
+ expanded_attn_mask = _make_causal_mask(
1096
+ input_shape,
1097
+ past_key_values_length=past_key_values_length,
1098
+ )
1099
+ # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
1100
+ if get_device_type() == "xpu":
1101
+ x = paddle.to_tensor(0.0, dtype="float32")
1102
+ y = paddle.to_tensor(-1.7005809656952787e38, dtype="float32")
1103
+ expanded_attn_mask = paddle.where(expanded_attn_mask, x, y)
1104
+ else:
1105
+ expanded_attn_mask = paddle.where(
1106
+ expanded_attn_mask.cast("bool"), 0.0, paddle.finfo(dtype).min
1107
+ ).astype(dtype)
1108
+ return expanded_attn_mask
1109
+
1110
+     def forward(
+         self,
+         input_ids: paddle.Tensor = None,
+         position_ids: Optional[paddle.Tensor] = None,
+         attention_mask: Optional[paddle.Tensor] = None,
+         inputs_embeds: Optional[paddle.Tensor] = None,
+         use_cache: Optional[bool] = None,
+         past_key_values: Optional[List[paddle.Tensor]] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         attn_mask_startend_row_indices=None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states  # fmt:skip
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         # retrieve input_ids and inputs_embeds
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError(
+                 "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+             )
+         elif input_ids is not None:
+             batch_size, seq_length = input_ids.shape
+         elif inputs_embeds is not None:
+             batch_size, seq_length, _ = inputs_embeds.shape
+         else:
+             raise ValueError(
+                 "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+             )
+
+         if past_key_values is None:
+             past_key_values = tuple([None] * len(self.layers))
+         # NOTE: use a list so per-layer cache entries can be cleared in time
+         past_key_values = list(past_key_values)
+
+         seq_length_with_past = seq_length
+         cache_length = 0
+         if past_key_values[0] is not None:
+             cache_length = past_key_values[0][0].shape[1]
+             seq_length_with_past += cache_length
+         if inputs_embeds is None:
+             # [bs, seq_len, dim]
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if self.sequence_parallel:
+             # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim]
+             bs, seq_len, hidden_size = inputs_embeds.shape
+             inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size])
+             # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism)
+             inputs_embeds = ScatterOp.apply(inputs_embeds)
+
+         # [bs, seq_len]
+         attention_mask = (
+             paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool)
+             if attention_mask is None
+             else attention_mask
+         )
+         attention_mask = self._prepare_decoder_attention_mask(
+             attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype
+         )  # [bs, 1, seq_len, seq_len]
+         if self.config.use_flash_attention:
+             attention_mask = None if is_casual_mask(attention_mask) else attention_mask
+
+         if position_ids is None:
+             position_ids = paddle.arange(seq_length, dtype="int64").expand(
+                 (batch_size, seq_length)
+             )
+
+         hidden_states = inputs_embeds
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = () if use_cache else None
+
+         for idx, decoder_layer in enumerate(self.layers):
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+             past_key_value = (
+                 past_key_values[idx] if past_key_values is not None else None
+             )
+
+             has_gradient = not hidden_states.stop_gradient
+             if (
+                 self.enable_recompute
+                 and idx not in self.no_recompute_layers
+                 and has_gradient
+                 and self.recompute_granularity == "full"
+             ):
+                 layer_outputs = self.recompute_training_full(
+                     decoder_layer,
+                     hidden_states,
+                     position_ids,
+                     attention_mask,
+                     output_attentions,
+                     past_key_value,
+                     use_cache,
+                     attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     position_ids,
+                     attention_mask,
+                     output_attentions,
+                     past_key_value,
+                     use_cache,
+                     attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+                 )
+
+             # NOTE: clear the outdated cache entry after it has been used, to save memory
+             past_key_value = past_key_values[idx] = None
+             if type(layer_outputs) is tuple:
+                 hidden_states = layer_outputs[0]
+             else:
+                 hidden_states = layer_outputs
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+             if use_cache:
+                 next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+
+         if not return_dict:
+             return tuple(
+                 v
+                 for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                 if v is not None
+             )
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attns,
+         )
+
+
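+ # Pretraining criterion: per-token cross entropy that skips positions labelled with
+ # ignore_index and averages the loss over the remaining tokens. When tensor parallelism
+ # and tensor_parallel_output are enabled, ParallelCrossEntropy is used so the loss can be
+ # computed directly on vocab-sharded logits.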
+ class Qwen2PretrainingCriterion(nn.Layer):
+     """
+     Criterion for Qwen2 pretraining.
+     It calculates the final loss.
+     """
+
+     def __init__(self, config: Qwen2Config):
+         super(Qwen2PretrainingCriterion, self).__init__()
+         self.ignore_index = getattr(config, "ignore_index", -100)
+         self.config = config
+         self.enable_parallel_cross_entropy = (
+             config.tensor_parallel_degree > 1 and config.tensor_parallel_output
+         )
+
+         if self.enable_parallel_cross_entropy:  # lm_head is distributed
+             self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index)
+         else:
+             self.loss_func = paddle.nn.CrossEntropyLoss(
+                 reduction="none", ignore_index=self.ignore_index
+             )
+
+     def forward(self, prediction_scores, masked_lm_labels):
+         if self.enable_parallel_cross_entropy:
+             if prediction_scores.shape[-1] == self.config.vocab_size:
+                 logging.warning(
+                     f"enable_parallel_cross_entropy, the vocab_size should be split: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+                 )
+                 self.loss_func = paddle.nn.CrossEntropyLoss(
+                     reduction="none", ignore_index=self.ignore_index
+                 )
+
+         with paddle.amp.auto_cast(False):
+             masked_lm_loss = self.loss_func(
+                 prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)
+             )
+
+             # skip ignored tokens, whose loss is exactly 0
+             # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0]
+             # loss = paddle.mean(masked_lm_loss)
+             binary_sequence = paddle.where(
+                 masked_lm_loss > 0,
+                 paddle.ones_like(masked_lm_loss),
+                 paddle.zeros_like(masked_lm_loss),
+             )
+             count = paddle.sum(binary_sequence)
+             if count == 0:
+                 loss = paddle.sum(masked_lm_loss * binary_sequence)
+             else:
+                 loss = paddle.sum(masked_lm_loss * binary_sequence) / count
+
+         return loss
+
+
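+ # LM head that maps hidden states to vocabulary logits via parallel_matmul. Under tensor
+ # parallelism each rank only holds its shard of the vocabulary dimension, and
+ # tensor_parallel_output controls whether the logits are gathered or kept sharded.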
+ class Qwen2LMHead(nn.Layer):
+     def __init__(self, config: Qwen2Config, embedding_weights=None, transpose_y=False):
+         super(Qwen2LMHead, self).__init__()
+         self.config = config
+         if (
+             config.tensor_parallel_degree > 1
+             and config.vocab_size % config.tensor_parallel_degree == 0
+         ):
+             vocab_size = config.vocab_size // config.tensor_parallel_degree
+         else:
+             vocab_size = config.vocab_size
+
+         self.transpose_y = transpose_y
+         if transpose_y:
+             if embedding_weights is not None:
+                 self.weight = embedding_weights
+             else:
+                 self.weight = self.create_parameter(
+                     shape=[vocab_size, config.hidden_size],
+                     dtype=paddle.get_default_dtype(),
+                 )
+         else:
+             self.weight = self.create_parameter(
+                 shape=[config.hidden_size, vocab_size],
+                 dtype=paddle.get_default_dtype(),
+             )
+
+         # Must set distributed attr for Tensor Parallel !
+         self.weight.is_distributed = (
+             True if (vocab_size != config.vocab_size) else False
+         )
+         if self.weight.is_distributed:
+             # for tie_word_embeddings
+             self.weight.split_axis = 0 if self.transpose_y else 1
+
+     def forward(self, hidden_states, tensor_parallel_output=None):
+         if self.config.sequence_parallel:
+             hidden_states = GatherOp.apply(hidden_states)
+             seq_length = self.config.seq_length
+             hidden_states = paddle.reshape_(
+                 hidden_states, [-1, seq_length, self.config.hidden_size]
+             )
+
+         if tensor_parallel_output is None:
+             tensor_parallel_output = self.config.tensor_parallel_output
+
+         logits = parallel_matmul(
+             hidden_states,
+             self.weight,
+             transpose_y=self.transpose_y,
+             tensor_parallel_output=tensor_parallel_output,
+         )
+         return logits
+
+
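+ # Causal-LM wrapper: a Qwen2Model backbone plus a Qwen2LMHead. With tie_word_embeddings the
+ # head reuses the token-embedding weights (transpose_y=True). A pretraining criterion is
+ # instantiated, but forward() below returns loss=None and leaves loss computation to callers.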
+ class Qwen2ForCausalLM(Qwen2PretrainedModel):
+     enable_to_static_method = True
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config: Qwen2Config):
+         super().__init__(config)
+         self.qwen2 = Qwen2Model(config)
+         if config.tie_word_embeddings:
+             self.lm_head = Qwen2LMHead(
+                 config,
+                 embedding_weights=self.qwen2.embed_tokens.weight,
+                 transpose_y=True,
+             )
+             self.tie_weights()
+         else:
+             self.lm_head = Qwen2LMHead(config)
+         self.criterion = Qwen2PretrainingCriterion(config)
+         self.vocab_size = config.vocab_size
+
+     def get_input_embeddings(self):
+         return self.qwen2.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.qwen2.embed_tokens = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def set_decoder(self, decoder):
+         self.qwen2 = decoder
+
+     def get_decoder(self):
+         return self.qwen2
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         use_cache=False,
+         past_key_values=None,
+         attention_mask=None,
+         inputs_embeds=None,
+         **kwargs,
+     ):
+         batch_size, seq_length = input_ids.shape
+         position_ids = kwargs.get(
+             "position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))
+         )
+         if past_key_values:
+             input_ids = input_ids[:, -1].unsqueeze(axis=-1)
+             position_ids = position_ids[:, -1].unsqueeze(-1)
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {"inputs_embeds": inputs_embeds}
+         else:
+             model_inputs = {"input_ids": input_ids}
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "past_key_values": past_key_values,
+                 "use_cache": use_cache,
+                 "attention_mask": attention_mask,
+             }
+         )
+         return model_inputs
+
+     def _get_model_inputs_spec(self, dtype: str):
+         return {
+             "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"),
+             "attention_mask": paddle.static.InputSpec(
+                 shape=[None, None], dtype="int64"
+             ),
+             "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"),
+         }
+
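+     # Called after each generation step: carries the returned KV cache into the next step,
+     # appends the next position id, and extends the attention mask by one column so the
+     # newly generated token can be attended to.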
+     @staticmethod
+     def update_model_kwargs_for_generation(
+         outputs, model_kwargs, is_encoder_decoder=False
+     ):
+         # update cache
+         if (
+             isinstance(outputs, tuple)
+             and len(outputs) > 1
+             and not isinstance(outputs[1], paddle.Tensor)
+         ):
+             model_kwargs["past_key_values"] = outputs[1]
+
+         if isinstance(outputs, CausalLMOutputWithPast) and "past_key_values" in outputs:
+             model_kwargs["past_key_values"] = outputs.past_key_values
+
+         # update position_ids
+         if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None:
+             position_ids = model_kwargs["position_ids"]
+             model_kwargs["position_ids"] = paddle.concat(
+                 [position_ids, position_ids[..., -1:] + 1], axis=-1
+             )
+
+         if not is_encoder_decoder and "attention_mask" in model_kwargs:
+             # TODO: support attention mask for other models
+             attention_mask = model_kwargs["attention_mask"]
+             if len(attention_mask.shape) == 2:
+                 model_kwargs["attention_mask"] = paddle.concat(
+                     [
+                         attention_mask,
+                         paddle.ones(
+                             [attention_mask.shape[0], 1], dtype=attention_mask.dtype
+                         ),
+                     ],
+                     axis=-1,
+                 )
+             elif len(attention_mask.shape) == 4:
+                 model_kwargs["attention_mask"] = paddle.concat(
+                     [
+                         attention_mask,
+                         paddle.ones(
+                             [*attention_mask.shape[:3], 1], dtype=attention_mask.dtype
+                         ),
+                     ],
+                     axis=-1,
+                 )[:, :, -1:, :]
+
+         return model_kwargs
+
+     def forward(
+         self,
+         input_ids: paddle.Tensor = None,
+         position_ids: Optional[paddle.Tensor] = None,
+         attention_mask: Optional[paddle.Tensor] = None,
+         inputs_embeds: Optional[paddle.Tensor] = None,
+         labels: Optional[paddle.Tensor] = None,
+         use_cache: Optional[bool] = None,
+         past_key_values: Optional[List[paddle.Tensor]] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         attn_mask_startend_row_indices=None,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         r"""
+         Args:
+             labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+         Returns:
+
+         Example:
+
+         ```python
+         >>> from paddlenlp.transformers import AutoTokenizer, Qwen2ForCausalLM
+
+         >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+         >>> prompt = "Hey, are you conscious? Can you talk to me?"
+         >>> inputs = tokenizer(prompt, return_tensors="pd")
+
+         >>> # Generate
+         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+         ```"""
+
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         if attn_mask_startend_row_indices is not None and attention_mask is not None:
+             logging.warning(
+                 "You have provided both attn_mask_startend_row_indices and attention_mask. "
+                 "The attn_mask_startend_row_indices will be used."
+             )
+             attention_mask = None
+
+         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.qwen2(
+             input_ids=input_ids,
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             past_key_values=past_key_values,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             attn_mask_startend_row_indices=attn_mask_startend_row_indices,
+         )
+
+         hidden_states = outputs[0]
+
+         # if labels is None, we need the full output instead of the tensor-parallel (sharded) one;
+         # tensor_parallel_output is used together with ParallelCrossEntropy
+         tensor_parallel_output = (
+             self.config.tensor_parallel_output
+             and self.config.tensor_parallel_degree > 1
+         )
+
+         logits = self.lm_head(
+             hidden_states, tensor_parallel_output=tensor_parallel_output
+         )
+         loss = None
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )