mindspore 2.2.0-cp38-cp38-manylinux1_x86_64.whl → 2.2.11-cp38-cp38-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic.
Files changed (170)
  1. mindspore/.commit_id +1 -1
  2. mindspore/_akg/akg/composite/build_module.py +104 -20
  3. mindspore/_akg/akg/utils/ascend_profilier/cann_file_parser.py +76 -0
  4. mindspore/_akg/akg/utils/ascend_profilier/file_manager.py +56 -0
  5. mindspore/_akg/akg/utils/ascend_profilier/op_summary_bean.py +23 -0
  6. mindspore/_akg/akg/utils/ascend_profilier/op_summary_headers.py +8 -0
  7. mindspore/_akg/akg/utils/ascend_profilier/op_summary_parser.py +42 -0
  8. mindspore/_akg/akg/utils/ascend_profilier/path_manager.py +65 -0
  9. mindspore/_akg/akg/utils/composite_op_helper.py +7 -2
  10. mindspore/_akg/akg/utils/dump_ascend_meta.py +22 -3
  11. mindspore/_akg/akg/utils/kernel_exec.py +41 -15
  12. mindspore/_akg/akg/utils/tbe_codegen_utils.py +27 -6
  13. mindspore/_akg/akg/utils/util.py +56 -1
  14. mindspore/_c_dataengine.cpython-38-x86_64-linux-gnu.so +0 -0
  15. mindspore/_c_expression.cpython-38-x86_64-linux-gnu.so +0 -0
  16. mindspore/_checkparam.py +3 -3
  17. mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
  18. mindspore/_extends/graph_kernel/splitter.py +3 -2
  19. mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
  20. mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
  21. mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
  22. mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
  23. mindspore/_extends/parse/__init__.py +3 -2
  24. mindspore/_extends/parse/parser.py +6 -1
  25. mindspore/_extends/parse/standard_method.py +14 -11
  26. mindspore/_extends/remote/kernel_build_server.py +2 -1
  27. mindspore/_mindspore_offline_debug.cpython-38-x86_64-linux-gnu.so +0 -0
  28. mindspore/bin/cache_admin +0 -0
  29. mindspore/bin/cache_server +0 -0
  30. mindspore/common/_utils.py +16 -0
  31. mindspore/common/api.py +1 -1
  32. mindspore/common/auto_dynamic_shape.py +81 -85
  33. mindspore/common/dump.py +1 -1
  34. mindspore/common/tensor.py +3 -20
  35. mindspore/config/op_info.config +1 -1
  36. mindspore/context.py +11 -4
  37. mindspore/dataset/engine/cache_client.py +8 -5
  38. mindspore/dataset/engine/datasets_standard_format.py +5 -0
  39. mindspore/dataset/vision/transforms.py +21 -21
  40. mindspore/experimental/optim/adam.py +1 -1
  41. mindspore/gen_ops.py +1 -1
  42. mindspore/include/api/model.h +17 -0
  43. mindspore/include/api/status.h +8 -3
  44. mindspore/lib/libdnnl.so.2 +0 -0
  45. mindspore/lib/libmindspore.so +0 -0
  46. mindspore/lib/libmindspore_backend.so +0 -0
  47. mindspore/lib/libmindspore_common.so +0 -0
  48. mindspore/lib/libmindspore_core.so +0 -0
  49. mindspore/lib/libmindspore_glog.so.0 +0 -0
  50. mindspore/lib/libmindspore_gpr.so.15 +0 -0
  51. mindspore/lib/libmindspore_grpc++.so.1 +0 -0
  52. mindspore/lib/libmindspore_grpc.so.15 +0 -0
  53. mindspore/lib/libmindspore_shared_lib.so +0 -0
  54. mindspore/lib/libnnacl.so +0 -0
  55. mindspore/lib/libopencv_core.so.4.5 +0 -0
  56. mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
  57. mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
  58. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310/aic-ascend310-ops-info.json +123 -0
  59. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +123 -0
  60. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +158 -0
  61. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +37 -0
  62. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
  63. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
  64. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
  65. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
  66. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
  67. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
  68. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
  69. mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
  70. mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
  71. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
  72. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
  73. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +78 -80
  74. mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
  75. mindspore/lib/plugin/ascend/libakg.so +0 -0
  76. mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
  77. mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
  78. mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
  79. mindspore/lib/plugin/cpu/libakg.so +0 -0
  80. mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
  81. mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
  82. mindspore/lib/plugin/gpu10.1/libakg.so +0 -0
  83. mindspore/lib/plugin/gpu11.1/libakg.so +0 -0
  84. mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
  85. mindspore/lib/plugin/gpu11.6/libakg.so +0 -0
  86. mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
  87. mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
  88. mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
  89. mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
  90. mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
  91. mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
  92. mindspore/nn/cell.py +0 -3
  93. mindspore/nn/layer/activation.py +4 -5
  94. mindspore/nn/layer/conv.py +39 -23
  95. mindspore/nn/layer/flash_attention.py +54 -129
  96. mindspore/nn/layer/math.py +3 -7
  97. mindspore/nn/layer/rnn_cells.py +5 -5
  98. mindspore/nn/wrap/__init__.py +4 -2
  99. mindspore/nn/wrap/cell_wrapper.py +12 -3
  100. mindspore/numpy/utils_const.py +5 -5
  101. mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
  102. mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
  103. mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
  104. mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
  105. mindspore/ops/_op_impl/aicpu/add.py +3 -3
  106. mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
  107. mindspore/ops/_utils/utils.py +2 -0
  108. mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
  109. mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
  110. mindspore/ops/function/array_func.py +10 -7
  111. mindspore/ops/function/grad/grad_func.py +0 -1
  112. mindspore/ops/function/nn_func.py +98 -9
  113. mindspore/ops/function/random_func.py +2 -1
  114. mindspore/ops/op_info_register.py +24 -21
  115. mindspore/ops/operations/__init__.py +6 -2
  116. mindspore/ops/operations/_grad_ops.py +25 -6
  117. mindspore/ops/operations/_inner_ops.py +155 -23
  118. mindspore/ops/operations/array_ops.py +9 -7
  119. mindspore/ops/operations/comm_ops.py +2 -2
  120. mindspore/ops/operations/custom_ops.py +85 -68
  121. mindspore/ops/operations/inner_ops.py +26 -3
  122. mindspore/ops/operations/math_ops.py +7 -6
  123. mindspore/ops/operations/nn_ops.py +193 -49
  124. mindspore/parallel/_parallel_serialization.py +10 -3
  125. mindspore/parallel/_tensor.py +4 -1
  126. mindspore/parallel/checkpoint_transform.py +13 -2
  127. mindspore/parallel/shard.py +17 -10
  128. mindspore/profiler/common/util.py +1 -0
  129. mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
  130. mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
  131. mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
  132. mindspore/profiler/parser/ascend_op_generator.py +1 -1
  133. mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
  134. mindspore/profiler/parser/base_timeline_generator.py +1 -1
  135. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
  136. mindspore/profiler/parser/framework_parser.py +1 -1
  137. mindspore/profiler/parser/profiler_info.py +19 -0
  138. mindspore/profiler/profiling.py +46 -24
  139. mindspore/rewrite/api/pattern_engine.py +1 -1
  140. mindspore/rewrite/parsers/for_parser.py +7 -7
  141. mindspore/rewrite/parsers/module_parser.py +4 -4
  142. mindspore/rewrite/symbol_tree.py +1 -4
  143. mindspore/run_check/_check_version.py +5 -3
  144. mindspore/safeguard/rewrite_obfuscation.py +52 -28
  145. mindspore/scipy/ops.py +55 -5
  146. mindspore/scipy/optimize/__init__.py +3 -2
  147. mindspore/scipy/optimize/linear_sum_assignment.py +38 -33
  148. mindspore/train/callback/_summary_collector.py +1 -1
  149. mindspore/train/dataset_helper.py +1 -0
  150. mindspore/train/model.py +2 -2
  151. mindspore/train/serialization.py +97 -11
  152. mindspore/train/summary/_summary_adapter.py +1 -1
  153. mindspore/train/summary/summary_record.py +23 -7
  154. mindspore/version.py +1 -1
  155. {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +3 -2
  156. {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +160 -151
  157. mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -406
  158. mindspore/ops/_op_impl/_custom_op/flash_attention/constants.py +0 -41
  159. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -467
  160. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -563
  161. mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -193
  162. mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -435
  163. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
  164. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
  165. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
  166. mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
  167. /mindspore/{ops/_op_impl/_custom_op/flash_attention → _akg/akg/utils/ascend_profilier}/__init__.py +0 -0
  168. {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
  169. {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
  170. {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,51 @@
+ # Copyright 2022 Huawei Technologies Co., Ltd
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ============================================================================
+ """ascend custom op: add by tik"""
+ from tbe.common.register import register_op_compute
+ from tbe.common.utils import para_check
+ from tbe import tik
+
+
+ @register_op_compute("AddTik")
+ @para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT,
+                             para_check.REQUIRED_OUTPUT, para_check.KERNEL_NAME)
+ def add_tik(x1, x2, y, kernel_name="add_tik"):
+     """add dsl impl function"""
+     tik_instance = tik.Tik()
+     x1_shape = x1.get("shape")
+     x2_shape = x2.get("shape")
+     y_shape = y.get("shape")
+
+     data_a = tik_instance.Tensor(
+         "float16", x1_shape, name="x1", scope=tik.scope_gm)
+     data_b = tik_instance.Tensor(
+         "float16", x2_shape, name="x2", scope=tik.scope_gm)
+     data_c = tik_instance.Tensor(
+         "float16", y_shape, name="y", scope=tik.scope_gm)
+     data_a_ub = tik_instance.Tensor(
+         "float16", x1_shape, name="data_A_ub", scope=tik.scope_ubuf)
+     data_b_ub = tik_instance.Tensor(
+         "float16", x2_shape, name="data_B_ub", scope=tik.scope_ubuf)
+     data_c_ub = tik_instance.Tensor(
+         "float16", y_shape, name="data_C_ub", scope=tik.scope_ubuf)
+
+     tik_instance.data_move(data_a_ub, data_a, 0, 1, 128 // 16, 0, 0)
+     tik_instance.data_move(data_b_ub, data_b, 0, 1, 128 // 16, 0, 0)
+     tik_instance.vec_add(
+         128, data_c_ub[0], data_a_ub[0], data_b_ub[0], 1, 8, 8, 8)
+     tik_instance.data_move(data_c, data_c_ub, 0, 1, 128 // 16, 0, 0)
+     tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[data_a, data_b], outputs=[data_c])
+
+     return tik_instance
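The hunk above corresponds to the new add_tik.py custom op (+51 lines), which builds a fixed-size float16 elementwise add kernel with TIK. Below is a minimal usage sketch, not part of the diff: the dict-style tensor descriptors follow the add_tik signature shown above, the 128-element shape matches the hard-coded data_move and vec_add lengths, and the kernel_name value is illustrative. It assumes the Ascend TBE/TIK toolchain that provides the tbe package is installed.

# Hypothetical driver for the add_tik kernel above; shapes and kernel_name are illustrative.
x1 = {"shape": (128,), "dtype": "float16"}
x2 = {"shape": (128,), "dtype": "float16"}
y = {"shape": (128,), "dtype": "float16"}
# add_tik builds the kernel and calls BuildCCE internally, then returns the tik.Tik instance.
tik_instance = add_tik(x1, x2, y, kernel_name="add_tik_demo")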
@@ -0,0 +1,241 @@
+ #!/usr/bin/python
+ # -*- coding: utf-8 -*-
+ # Copyright 2023 Huawei Technologies Co., Ltd
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ============================================================================
+ """ascend custom op: kv_cache_mgr by tik"""
+
+ import functools
+ from tbe import tik
+ import tbe.common.platform as tbe_platform
+ from tbe.common.utils import para_check
+
+
+ # 'pylint: disable=unused-argument,unused-variable,too-many-arguments,too-many-locals
+ def check_supported(past, cur, index, out, kernel_name="kv_cache_mgr"):
+     """check data type and shape"""
+     # check data type
+     past_dtype = past.get("dtype").lower()
+     cur_dtype = cur.get("dtype").lower()
+     out_dtype = out.get("dtype").lower()
+
+     if past_dtype != cur_dtype or past_dtype != out_dtype:
+         reason = "past_dtype is %s, cur_dtype is %s, out_dtype is %s" % (past_dtype, cur_dtype, out_dtype)
+         return False, reason
+
+     support_dtype_list = ["float32", "int32", "uint32",
+                           "float16", "int16", "uint16",
+                           "int8", "uint8"]
+     if past_dtype not in support_dtype_list:
+         reason = "past_dtype(%s) is not support" % (past_dtype)
+         return False, reason
+
+     index_dtype = index.get("dtype").lower()
+     if index_dtype != "int32":
+         reason = "index_dtype is %s, not int32" % (index_dtype)
+         return False, reason
+
+     # check shape
+     past_shape = past.get("shape")
+     cur_shape = cur.get("shape")
+
+     if len(past_shape) != 4 or len(cur_shape) != 4:
+         reason = "len(past_shape) != 4 or len(cur_shape) != 4 "
+         return False, reason
+
+     # key_past shape: (bs, num_heads, size_per_head, seq_length)
+     # value_past shape: (bs, num_heads, seq_length, size_per_head)
+     # key shape: (bs, num_heads, 1, size_per_head)
+     # value shape: (bs, num_heads, 1, size_per_head)
+
+     if past_shape[0] != cur_shape[0] or past_shape[1] != cur_shape[1]:
+         reason = "past_shape[0] != cur_shape[0] or past_shape[1] != cur_shape[1] "
+         return False, reason
+
+     if past_shape[3] != cur_shape[3]:
+         reason = "past_shape[3] != cur_shape[3]"
+         return False, reason
+
+     return True, ""
+
+
+ def ceil_div(dividend, divisor):
+     return (dividend + divisor - 1) // divisor
+
+
+ def get_loop_info(total_num, each_loop_num):
+     loop_times = ceil_div(total_num, each_loop_num)
+     last_loop_num = total_num - each_loop_num * (loop_times - 1)
+     return loop_times, last_loop_num
+
+
+ def elements_align(index_elements, data_size, align_size):
+     """Get element num align to align_size"""
+     total_size = index_elements * data_size
+     aligned_total_size = (total_size + align_size - 1) // align_size * align_size
+     return aligned_total_size // data_size
+
+
+ class TilingHelper:
+     """Tiling parameter"""
+     def __init__(self, past, cur, index, out, kernel_name="kv_cache_mgr"):
+         self.kernel_name = kernel_name
+
+         # sys info
+         self.core_num = tbe_platform.get_soc_spec(tbe_platform.CORE_NUM)
+         self.ub_size = tbe_platform.get_soc_spec(tbe_platform.UB_SIZE)
+
+         self.past_shape = past.get("shape")
+         self.cur_shape = cur.get("shape")
+         self.index_shape = index.get("shape")
+
+         self.gm_type = past.get("dtype").lower()
+         self.ub_type = self.gm_type
+         self.index_ub_type = "int32"
+         self.int32_size = 4
+
+         self.gm_dtype_size = 2
+         if self.gm_type in ["int8", "uint8"]:
+             self.gm_dtype_size = 1
+         elif self.gm_type in ["float16", "int16", "uint16"]:
+             self.gm_dtype_size = 2
+         elif self.gm_type in ["float32", "int32", "uint32"]:
+             self.gm_dtype_size = 4
+
+         # tiling policy
+         self.seq_length = self.past_shape[2]
+         self.size_per_head = self.past_shape[3]
+         self.update_seq_length = self.cur_shape[2]
+
+         self.num_head = self.past_shape[1]
+
+         self.past_elements = functools.reduce(lambda a, b: a * b, self.past_shape)
+         self.cur_elements = functools.reduce(lambda a, b: a * b, self.cur_shape)
+
+         # The `burst` unit is 32B
+         index_elements = functools.reduce(lambda a, b: a * b, self.index_shape)
+         self.index_elements = elements_align(index_elements, self.int32_size, 32)
+
+         # split cur
+         self.cur_bs = self.cur_shape[0] * self.cur_shape[1]
+         self.each_core_bs_num = ceil_div(self.cur_bs, self.core_num)
+         self.core_num, self.last_core_bs_num = get_loop_info(self.cur_bs, self.each_core_bs_num)
+         self.cur_ub_elements = self.each_core_bs_num * self.update_seq_length * self.size_per_head
+         self.last_cure_ub_elements = self.last_core_bs_num * self.update_seq_length * self.size_per_head
+
+
+ class KVCacheImpl(TilingHelper):
+     """KVCacheImpl"""
+     def __init__(self, past, cur, index, out, kernel_name):
+         super().__init__(past, cur, index, out, kernel_name)
+         # key_past or value_past shape: (bs, num_heads, seq_length, size_per_head)
+         # batch_valid_length
+         # cur update shape: (bs, num_heads, 1, size_per_head)
+
+         self.tik_inst = tik.Tik(disable_debug=True)
+         self.past_gm = self.tik_inst.Tensor(self.gm_type, (self.past_elements,), name="past_gm", scope=tik.scope_gm)
+         self.cur_gm = self.tik_inst.Tensor(self.gm_type, (self.cur_elements,), name="cur_gm", scope=tik.scope_gm)
+         self.index_gm = self.tik_inst.Tensor(self.index_ub_type, (self.index_elements,), name="index_gm",
+                                              scope=tik.scope_gm)
+         # we use is_atomic_add=True to set the out_gm zeros. But if inplace update out_gm, no need to set this flag.
+         self.out_gm = self.tik_inst.Tensor(self.gm_type, (self.past_elements,), name="out_gm", scope=tik.scope_gm)
+
+     def valid_cur_ub_load(self, core_idx):
+         """KVCacheImpl.valid_cur_ub_load"""
+         cur_ub = self.tik_inst.Tensor(self.ub_type, (self.cur_ub_elements,), name="valid_cur_ub",
+                                       scope=tik.scope_ubuf)
+         cur_gm_offset = core_idx * self.cur_ub_elements
+         with self.tik_inst.if_scope(core_idx != self.core_num -1):
+             self.tik_inst.data_move(cur_ub, self.cur_gm[cur_gm_offset:], 0, 1,
+                                     self.cur_ub_elements * self.gm_dtype_size // 32, 0, 0)
+         with self.tik_inst.else_scope():
+             self.tik_inst.data_move(cur_ub, self.cur_gm[cur_gm_offset:], 0, 1,
+                                     self.last_cure_ub_elements * self.gm_dtype_size // 32, 0, 0)
+         return cur_ub
+
+     def valid_index_ub_load(self):
+         """KVCacheImpl.valid_index_ub_load"""
+         index_ub = self.tik_inst.Tensor(self.index_ub_type, (self.index_elements,), name="valid_index_ub",
+                                         scope=tik.scope_ubuf)
+         self.tik_inst.data_move(index_ub, self.index_gm, 0, 1, self.index_elements * self.int32_size // 32, 0, 0)
+         return index_ub
+
+     def valid_pos_update(self, core_idx, cur_ub, index_ub, each_core_bs_num):
+         """KVCacheImpl.valid_pos_update"""
+         src_bs_stride = self.update_seq_length * self.size_per_head
+         dst_bs_stride = self.seq_length * self.size_per_head
+         burst_len = self.update_seq_length * self.size_per_head * self.gm_dtype_size // 32
+
+         valid_idx = self.tik_inst.Scalar(dtype="int32")
+         with self.tik_inst.for_range(0, each_core_bs_num) as each_core_bs_idx:
+             bs_idx = core_idx * self.each_core_bs_num + each_core_bs_idx
+             # because we fused bs * num_head, we need get the real bs_idx
+             valid_idx.set_as(index_ub[bs_idx // self.num_head])
+             with self.tik_inst.if_scope(valid_idx >= 0):
+                 dst_offset = bs_idx * dst_bs_stride + valid_idx * self.size_per_head
+                 src_offset = each_core_bs_idx * src_bs_stride
+                 if burst_len < 65536:
+                     self.tik_inst.data_move(self.out_gm[dst_offset], cur_ub[src_offset],
+                                             0, 1, burst_len, 0, 0)
+                 else:
+                     nburst = 1
+                     each_burst_len = burst_len
+                     while each_burst_len > 65535:
+                         nburst += 1
+                         each_burst_len = burst_len // nburst
+                     self.tik_inst.data_move(self.out_gm[dst_offset], cur_ub[src_offset], 0,
+                                             nburst, each_burst_len, 0, 0)
+
+     # 'pylint: disable=too-many-arguments
+     def compute_each_core(self, core_idx, core_bs_num):
+         """KVCacheImpl.compute_each_core"""
+         index_ub = self.valid_index_ub_load()
+         cur_ub = self.valid_cur_ub_load(core_idx)
+         self.valid_pos_update(core_idx, cur_ub, index_ub, core_bs_num)
+
+     def compute(self):
+         """KVCacheImpl.compute"""
+         if self.each_core_bs_num == self.last_core_bs_num:
+             with self.tik_inst.for_range(0, self.core_num, block_num=self.core_num) as core_index:
+                 self.compute_each_core(core_idx=core_index, core_bs_num=self.each_core_bs_num)
+         else:
+             with self.tik_inst.for_range(0, self.core_num, block_num=self.core_num) as core_index:
+                 with self.tik_inst.if_scope(core_index < self.core_num - 1):
+                     self.compute_each_core(core_idx=core_index, core_bs_num=self.each_core_bs_num)
+                 with self.tik_inst.else_scope():
+                     self.compute_each_core(core_idx=core_index, core_bs_num=self.last_core_bs_num)
+
+         self.tik_inst.BuildCCE(kernel_name=self.kernel_name,
+                                inputs=[self.past_gm, self.cur_gm, self.index_gm],
+                                outputs=[self.out_gm],
+                                )
+         return self.tik_inst
+
+
+ # 'pylint: disable = unused-argument
+ # 'pylint: disable=too-many-arguments,too-many-locals
+ @para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT,
+                             para_check.REQUIRED_INPUT, para_check.REQUIRED_OUTPUT,
+                             para_check.KERNEL_NAME)
+ def kv_cache_mgr(past, cur, index, out, kernel_name="kv_cache_mgr"):
+     """
+     :param past: key_past or value_past. shape: (bs, num_head, seq_length, size_pre_head)
+     :param cur: key_current or value_current. shape: (bs, num_head, update_seq_length, size_pre_head)
+     :param index: which index to update. shape * len(dtype) need be multiples of 32. Option Input.
+     :param out: output shape: (bs, num_head, seq_length, size_pre_head)
+     :param kernel_name: the name of the op
+     :return:
+     """
+     obj = KVCacheImpl(past, cur, index, out, kernel_name)
+     return obj.compute()
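The hunk above corresponds to the new kv_cache_mgr.py custom op (+241 lines), which scatters the current key/value slice into the cache at the positions carried by index, splitting the fused bs * num_head dimension across cores. A minimal sketch of driving it directly, not taken from the diff: the shapes follow the docstring (past and out are (bs, num_head, seq_length, size_per_head), cur uses update_seq_length = 1, index holds one int32 position per batch element), while the concrete sizes and kernel_name are illustrative assumptions; bs = 8 keeps the index buffer a multiple of 32 bytes as the docstring requires.

# Hypothetical call into the kv_cache_mgr entry point above; all sizes are illustrative.
bs, num_head, seq_length, size_per_head = 8, 4, 1024, 128
past = {"shape": (bs, num_head, seq_length, size_per_head), "dtype": "float16"}
cur = {"shape": (bs, num_head, 1, size_per_head), "dtype": "float16"}
index = {"shape": (bs,), "dtype": "int32"}
out = {"shape": (bs, num_head, seq_length, size_per_head), "dtype": "float16"}
ok, reason = check_supported(past, cur, index, out)   # dtype/shape pre-check defined above
assert ok, reason
tik_instance = kv_cache_mgr(past, cur, index, out, kernel_name="kv_cache_mgr_demo")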
@@ -0,0 +1,212 @@
+ """
+ Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ matmul_tik
+ """
+
+ from tbe import tik
+ from tbe.common.platform import get_soc_spec
+
+ DTYPE_SIZE = {
+     'bool': 1,
+     'uint8': 1,
+     'int8': 1,
+     'uint16': 2,
+     'int16': 2,
+     'int24': 3,
+     'uint32': 4,
+     'int32': 4,
+     'float16': 2,
+     'float32': 4,
+     'int48': 6,
+     'int64': 8,
+     'uint64': 8,
+     'float64': 8
+ }
+
+
+ def MK_TO_K1MK0(tik_instance, mk_input_tensor, k1mk0_tensor, dtype, k1, m, k0):
+     """data move mk to k1mk0"""
+     src_ub = tik_instance.Tensor(dtype, (k1, m, k0), name='src_ub', scope=tik.scope_ubuf)
+
+     # data_move(m, k) ---> (k1, m, k0)
+     with tik_instance.for_range(0, k1) as i:
+         tik_instance.data_move(src_ub[i * m * k0:], mk_input_tensor[i * k0:], 0, m, k0 * DTYPE_SIZE[dtype] // 32,
+                                (k1 - 1) * k0 * DTYPE_SIZE[dtype] // 32, 0)
+
+     tik_instance.data_move(k1mk0_tensor, src_ub, 0, 1, k1 * m * k0 * DTYPE_SIZE[dtype] // 32, 0, 0)
+
+
+ def KN_TO_K1NK0(tik_instance, kn_input_tensor, k1nk0_tensor, dtype, k1, n, k0):
+     """data move kn to k1nk0"""
+
+     with tik_instance.for_range(0, k1) as index:
+         k1nk0_ub = tik_instance.Tensor(dtype, (n, k0), tik.scope_ubuf, "k1nk0_ub")
+         src_ub = tik_instance.Tensor(dtype, (k0, n), tik.scope_ubuf, "src_ub")
+         burst_len = k0 * n * DTYPE_SIZE[dtype] // 32
+         tik_instance.data_move(src_ub, kn_input_tensor[index * k0 * n], 0, 1, burst_len, 0, 0)
+         dst_list = [k1nk0_ub[16 * i] for i in range(16)]
+         src_list = [src_ub[n * i] for i in range(16)]
+         rep_times = n // k0
+         dst_rep_stride = k0
+         src_rep_stride = 1
+         tik_instance.vec_trans_scatter(False, False, dst_list, src_list, rep_times, dst_rep_stride, src_rep_stride)
+         tik_instance.data_move(k1nk0_tensor[index * k0 * n], k1nk0_ub, 0, 1, burst_len, 0, 0)
+
+
+ def N1MN0_TO_MN(tik_instance, mn_output_tensor, n1mn0_tensor, dtype, n1, m, n0):
+     """data move mn to n1mn0"""
+     src_ub = tik_instance.Tensor(dtype, (m, n1 * n0), name='src_ub', scope=tik.scope_ubuf)
+
+     # data_move(n1, m, n0) ---> (m, n)
+     with tik_instance.for_range(0, n1) as i:
+         tik_instance.data_move(src_ub[i * n0:], n1mn0_tensor[i * m * n0:], 0, m,
+                                n0 * DTYPE_SIZE[dtype] // 32, 0, (n1 - 1) * n0 * DTYPE_SIZE[dtype] // 32)
+
+     tik_instance.data_move(mn_output_tensor, src_ub, 0, 1, m * n1 * n0 * DTYPE_SIZE[dtype] // 32, 0, 0)
+
+
+ def matmul_tik_compute(params, kernel_name):
+     """
+     matmul tik compute
+     @param params: matmul data
+     @param kernel_name: kernel name
+     @return: tik instance
+     """
+     tik_instance = tik.Tik()
+     if not isinstance(params, dict):
+         params = params.__dict__
+     m_size, k_size, n_size = params['M'], params['K'], params['N']
+     data_type = params["data_type"]
+     m_tiling_size = int(params["m_tiling_size"])
+     n_tiling_size = int(params["n_tiling_size"])
+     k_tiling_size = int(params['k_tiling_size'])
+
+     m_cycle_times = params["m_cycle_times"]
+     n_cycle_times = params["n_cycle_times"]
+     k_cycle_times = params["k_cycle_times"]
+
+     # Determine the output type
+     if data_type == "float16":
+         if get_soc_spec("SOC_VERSION") in ["SD3403", "OPTG", "Hi3796CV300CS", "TsnsC"]:
+             C_loc_out_type = "float16"
+         else:
+             C_loc_out_type = "float32"
+         K0 = 16
+     else:
+         C_loc_out_type = "int32"
+         K0 = 32
+     block_size = 16
+
+     n_thread_num = params['n_thread_num']
+     m_thread_num = params['m_thread_num']
+     k_thread_num = params['k_thread_num']
+
+     mk_gm_input = tik_instance.Tensor(data_type, (m_size, k_size), name="mk_input_gm", scope=tik.scope_gm)
+     kn_gm_input = tik_instance.Tensor(data_type, (k_size, n_size), name="kn_input_gm", scope=tik.scope_gm)
+
+     k1mk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, m_size, K0), name="k1mk0_workspace",
+                                           scope=tik.scope_gm, is_workspace=True)
+
+     k1nk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, n_size, K0), name="k1nk0_workspace",
+                                           scope=tik.scope_gm, is_workspace=True)
+
+     mn_gm_output = tik_instance.Tensor(C_loc_out_type, (m_size, n_size), tik.scope_gm, name="mn_output_gm")
+     nmk0_workspace = tik_instance.Tensor(C_loc_out_type, (n_size // block_size, m_size, block_size),
+                                          name="nmk0_workspace", scope=tik.scope_gm, is_workspace=True)
+
+     MK_TO_K1MK0(tik_instance, mk_gm_input, k1mk0_workspace, data_type, k_size // K0, m_size, K0)
+     KN_TO_K1NK0(tik_instance, kn_gm_input, k1nk0_workspace, data_type, k_size // K0, n_size, K0)
+
+     # Tiling is realized through the for_range() loop.
+     with tik_instance.for_range(0, 2, block_num=1) as core_id:
+         with tik_instance.for_range(0, n_cycle_times // 2, thread_num=n_thread_num) as n_idx:
+             with tik_instance.for_range(0, m_cycle_times, thread_num=m_thread_num) as m_idx:
+                 dst_l0c = tik_instance.Tensor(C_loc_out_type, [n_tiling_size // 16, m_tiling_size, 16], name='dst_l0c',
+                                               scope=tik.scope_cbuf_out)
+                 with tik_instance.for_range(0, k_cycle_times,
+                                             thread_num=k_thread_num) as k_idx:
+                     # Calculation result data transfer.
+                     inputa_l1 = tik_instance.Tensor(params['data_type'], [k_tiling_size // K0, m_tiling_size, K0],
+                                                     name="A_tiling_l1", scope=tik.scope_cbuf)
+                     tik_instance.data_move(inputa_l1,
+                                            k1mk0_workspace[k_idx * k_tiling_size // K0, m_idx * m_tiling_size, :],
+                                            0, k_tiling_size // K0, m_tiling_size, m_size - m_tiling_size, 0)
+                     inputb_l1 = tik_instance.Tensor(params["data_type"], [k_tiling_size // K0, n_tiling_size, K0],
+                                                     name="B_tiling_l1", scope=tik.scope_cbuf)
+                     if n_size - n_tiling_size > 65535:
+                         with tik_instance.for_range(0, k_tiling_size // K0) \
+                                 as dma_k_idx:
+                             tik_instance.data_move(inputb_l1[dma_k_idx, :, :],
+                                                    k1nk0_workspace[k_idx * k_tiling_size // K0 + dma_k_idx,
+                                                                    (core_id * n_cycle_times // 2 + n_idx)
+                                                                    * n_tiling_size, :],
+                                                    0, 1, n_tiling_size, 0, 0)
+                     else:
+                         tik_instance.data_move(inputb_l1, k1nk0_workspace[k_idx * k_tiling_size // K0,
+                                                                           (core_id * n_cycle_times // 2 + n_idx)
+                                                                           * n_tiling_size, :],
+                                                0, k_tiling_size // K0, n_tiling_size, n_size - n_tiling_size, 0)
+                     # Call matmul API to matrix multiplication calculation.
+                     with tik_instance.if_scope(k_idx == 0):
+                         tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
+                                             init_l1out=True)
+                     with tik_instance.else_scope():
+                         tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
+                                             init_l1out=False)
+                 tik_instance.fixpipe(nmk0_workspace[n_tiling_size // 16 * (core_id * n_cycle_times // 2 + n_idx),
+                                                     m_idx * m_tiling_size, :],
+                                      dst_l0c, n_tiling_size // 16,
+                                      m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
+                                      (m_size - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] // 32, 0)
+
+     N1MN0_TO_MN(tik_instance, mn_gm_output, nmk0_workspace, C_loc_out_type, n_size // K0, m_size, K0)
+
+     tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[mk_gm_input, kn_gm_input], outputs=[mn_gm_output])
+     return tik_instance
+
+
+ def matmul_tik(input_x1, input_x2, output_y=None, kernel_name="simple_matmul"):
+     """
+     matmul_tik main func
+     Parameters
+     ----------
+     input_x1: input data 1
+     input_x2: input data 2
+     output_y: output dta
+     """
+     shape_a = input_x1.get("ori_shape")
+     shape_b = input_x2.get("ori_shape")
+     m = shape_a[0]
+     k = shape_a[1]
+     n = shape_b[1]
+     data_type = input_x1.get("dtype").lower()
+     params = {
+         'M': m,
+         'K': k,
+         'N': n,
+         'data_type': data_type,
+         'm_tiling_size': 16,
+         'm_cycle_times': 1,
+         'm_thread_num': 1,
+         'n_tiling_size': 64,
+         'n_cycle_times': 16,
+         'n_thread_num': 1,
+         'k_tiling_size': 32,
+         'k_cycle_times': 2,
+         'k_thread_num': 2,
+         'output_y': output_y
+     }
+     return matmul_tik_compute(params, kernel_name)
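The hunk above corresponds to the new matmul_tik.py custom op (+212 lines). matmul_tik hard-codes its tiling (m_tiling_size 16 with 1 cycle, k_tiling_size 32 with 2 cycles, n_tiling_size 64 with 16 cycles split over two passes), so only matching shapes make sense: M = 16, K = 64, N = 1024. Below is a minimal sketch of a consistent call, not taken from the diff; the descriptor dicts mirror the ori_shape and dtype accesses in matmul_tik, and the kernel_name value is illustrative.

# Hypothetical invocation of matmul_tik with shapes that match its fixed tiling parameters.
input_x1 = {"ori_shape": (16, 64), "dtype": "float16"}    # M x K
input_x2 = {"ori_shape": (64, 1024), "dtype": "float16"}  # K x N
tik_instance = matmul_tik(input_x1, input_x2, kernel_name="matmul_tik_demo")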