mindspore-2.2.0-cp38-cp38-manylinux1_x86_64.whl → mindspore-2.2.11-cp38-cp38-manylinux1_x86_64.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_akg/akg/composite/build_module.py +104 -20
- mindspore/_akg/akg/utils/ascend_profilier/cann_file_parser.py +76 -0
- mindspore/_akg/akg/utils/ascend_profilier/file_manager.py +56 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_bean.py +23 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_headers.py +8 -0
- mindspore/_akg/akg/utils/ascend_profilier/op_summary_parser.py +42 -0
- mindspore/_akg/akg/utils/ascend_profilier/path_manager.py +65 -0
- mindspore/_akg/akg/utils/composite_op_helper.py +7 -2
- mindspore/_akg/akg/utils/dump_ascend_meta.py +22 -3
- mindspore/_akg/akg/utils/kernel_exec.py +41 -15
- mindspore/_akg/akg/utils/tbe_codegen_utils.py +27 -6
- mindspore/_akg/akg/utils/util.py +56 -1
- mindspore/_c_dataengine.cpython-38-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-38-x86_64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +3 -3
- mindspore/_extends/graph_kernel/model/graph_split.py +84 -76
- mindspore/_extends/graph_kernel/splitter.py +3 -2
- mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +83 -66
- mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +4 -4
- mindspore/_extends/parallel_compile/akg_compiler/util.py +10 -7
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +2 -1
- mindspore/_extends/parse/__init__.py +3 -2
- mindspore/_extends/parse/parser.py +6 -1
- mindspore/_extends/parse/standard_method.py +14 -11
- mindspore/_extends/remote/kernel_build_server.py +2 -1
- mindspore/_mindspore_offline_debug.cpython-38-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/_utils.py +16 -0
- mindspore/common/api.py +1 -1
- mindspore/common/auto_dynamic_shape.py +81 -85
- mindspore/common/dump.py +1 -1
- mindspore/common/tensor.py +3 -20
- mindspore/config/op_info.config +1 -1
- mindspore/context.py +11 -4
- mindspore/dataset/engine/cache_client.py +8 -5
- mindspore/dataset/engine/datasets_standard_format.py +5 -0
- mindspore/dataset/vision/transforms.py +21 -21
- mindspore/experimental/optim/adam.py +1 -1
- mindspore/gen_ops.py +1 -1
- mindspore/include/api/model.h +17 -0
- mindspore/include/api/status.h +8 -3
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore.so +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_shared_lib.so +0 -0
- mindspore/lib/libnnacl.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310/aic-ascend310-ops-info.json +123 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +123 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +158 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +37 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_dsl.py +46 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/add_tik.py +51 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +241 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/matmul_tik.py +212 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_aicpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +78 -80
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/libakg.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_aicpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/cpu/libakg.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libakg.so +0 -0
- mindspore/lib/plugin/gpu11.1/libakg.so +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libakg.so +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.1 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/nn/cell.py +0 -3
- mindspore/nn/layer/activation.py +4 -5
- mindspore/nn/layer/conv.py +39 -23
- mindspore/nn/layer/flash_attention.py +54 -129
- mindspore/nn/layer/math.py +3 -7
- mindspore/nn/layer/rnn_cells.py +5 -5
- mindspore/nn/wrap/__init__.py +4 -2
- mindspore/nn/wrap/cell_wrapper.py +12 -3
- mindspore/numpy/utils_const.py +5 -5
- mindspore/ops/_grad_experimental/grad_array_ops.py +1 -1
- mindspore/ops/_grad_experimental/grad_implementations.py +2 -2
- mindspore/ops/_grad_experimental/grad_math_ops.py +19 -18
- mindspore/ops/_grad_experimental/grad_sparse_ops.py +3 -3
- mindspore/ops/_op_impl/aicpu/add.py +3 -3
- mindspore/ops/_op_impl/aicpu/linear_sum_assignment.py +21 -2
- mindspore/ops/_utils/utils.py +2 -0
- mindspore/ops/composite/multitype_ops/_compile_utils.py +2 -1
- mindspore/ops/composite/multitype_ops/getitem_impl.py +2 -2
- mindspore/ops/function/array_func.py +10 -7
- mindspore/ops/function/grad/grad_func.py +0 -1
- mindspore/ops/function/nn_func.py +98 -9
- mindspore/ops/function/random_func.py +2 -1
- mindspore/ops/op_info_register.py +24 -21
- mindspore/ops/operations/__init__.py +6 -2
- mindspore/ops/operations/_grad_ops.py +25 -6
- mindspore/ops/operations/_inner_ops.py +155 -23
- mindspore/ops/operations/array_ops.py +9 -7
- mindspore/ops/operations/comm_ops.py +2 -2
- mindspore/ops/operations/custom_ops.py +85 -68
- mindspore/ops/operations/inner_ops.py +26 -3
- mindspore/ops/operations/math_ops.py +7 -6
- mindspore/ops/operations/nn_ops.py +193 -49
- mindspore/parallel/_parallel_serialization.py +10 -3
- mindspore/parallel/_tensor.py +4 -1
- mindspore/parallel/checkpoint_transform.py +13 -2
- mindspore/parallel/shard.py +17 -10
- mindspore/profiler/common/util.py +1 -0
- mindspore/profiler/parser/ascend_hccl_generator.py +232 -0
- mindspore/profiler/parser/ascend_msprof_exporter.py +86 -43
- mindspore/profiler/parser/ascend_msprof_generator.py +196 -9
- mindspore/profiler/parser/ascend_op_generator.py +1 -1
- mindspore/profiler/parser/ascend_timeline_generator.py +6 -182
- mindspore/profiler/parser/base_timeline_generator.py +1 -1
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -2
- mindspore/profiler/parser/framework_parser.py +1 -1
- mindspore/profiler/parser/profiler_info.py +19 -0
- mindspore/profiler/profiling.py +46 -24
- mindspore/rewrite/api/pattern_engine.py +1 -1
- mindspore/rewrite/parsers/for_parser.py +7 -7
- mindspore/rewrite/parsers/module_parser.py +4 -4
- mindspore/rewrite/symbol_tree.py +1 -4
- mindspore/run_check/_check_version.py +5 -3
- mindspore/safeguard/rewrite_obfuscation.py +52 -28
- mindspore/scipy/ops.py +55 -5
- mindspore/scipy/optimize/__init__.py +3 -2
- mindspore/scipy/optimize/linear_sum_assignment.py +38 -33
- mindspore/train/callback/_summary_collector.py +1 -1
- mindspore/train/dataset_helper.py +1 -0
- mindspore/train/model.py +2 -2
- mindspore/train/serialization.py +97 -11
- mindspore/train/summary/_summary_adapter.py +1 -1
- mindspore/train/summary/summary_record.py +23 -7
- mindspore/version.py +1 -1
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/METADATA +3 -2
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/RECORD +160 -151
- mindspore/ops/_op_impl/_custom_op/flash_attention/attention.py +0 -406
- mindspore/ops/_op_impl/_custom_op/flash_attention/constants.py +0 -41
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_bwd.py +0 -467
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_fwd.py +0 -563
- mindspore/ops/_op_impl/_custom_op/flash_attention/flash_attention_impl.py +0 -193
- mindspore/ops/_op_impl/_custom_op/flash_attention/tik_ops_utils.py +0 -435
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/__init__.py +0 -0
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/sparse_tiling.py +0 -45
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/strategy.py +0 -67
- mindspore/ops/_op_impl/_custom_op/flash_attention/tiling_strategy/wukong_tiling.py +0 -62
- /mindspore/{ops/_op_impl/_custom_op/flash_attention → _akg/akg/utils/ascend_profilier}/__init__.py +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/WHEEL +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/entry_points.txt +0 -0
- {mindspore-2.2.0.dist-info → mindspore-2.2.11.dist-info}/top_level.txt +0 -0
custom_aicore_ops_impl/add_tik.py
@@ -0,0 +1,51 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""ascend custom op: add by tik"""
+from tbe.common.register import register_op_compute
+from tbe.common.utils import para_check
+from tbe import tik
+
+
+@register_op_compute("AddTik")
+@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT,
+                            para_check.REQUIRED_OUTPUT, para_check.KERNEL_NAME)
+def add_tik(x1, x2, y, kernel_name="add_tik"):
+    """add dsl impl function"""
+    tik_instance = tik.Tik()
+    x1_shape = x1.get("shape")
+    x2_shape = x2.get("shape")
+    y_shape = y.get("shape")
+
+    data_a = tik_instance.Tensor(
+        "float16", x1_shape, name="x1", scope=tik.scope_gm)
+    data_b = tik_instance.Tensor(
+        "float16", x2_shape, name="x2", scope=tik.scope_gm)
+    data_c = tik_instance.Tensor(
+        "float16", y_shape, name="y", scope=tik.scope_gm)
+    data_a_ub = tik_instance.Tensor(
+        "float16", x1_shape, name="data_A_ub", scope=tik.scope_ubuf)
+    data_b_ub = tik_instance.Tensor(
+        "float16", x2_shape, name="data_B_ub", scope=tik.scope_ubuf)
+    data_c_ub = tik_instance.Tensor(
+        "float16", y_shape, name="data_C_ub", scope=tik.scope_ubuf)
+
+    tik_instance.data_move(data_a_ub, data_a, 0, 1, 128 // 16, 0, 0)
+    tik_instance.data_move(data_b_ub, data_b, 0, 1, 128 // 16, 0, 0)
+    tik_instance.vec_add(
+        128, data_c_ub[0], data_a_ub[0], data_b_ub[0], 1, 8, 8, 8)
+    tik_instance.data_move(data_c, data_c_ub, 0, 1, 128 // 16, 0, 0)
+    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[data_a, data_b], outputs=[data_c])
+
+    return tik_instance
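The burst lengths passed to data_move above are counted in 32-byte blocks, so `128 // 16` means eight blocks for 128 float16 elements (16 two-byte elements per block). A minimal Python sketch of that conversion follows; the helper name is hypothetical and not part of the diff:

# Hypothetical helper, for illustration only: data_move burst lengths are expressed in 32-byte blocks.
BYTES_PER_ELEM = {"int8": 1, "float16": 2, "float32": 4}  # bytes per element (subset)

def burst_blocks(num_elements, dtype):
    """Number of 32-byte bursts needed to move num_elements of dtype."""
    return num_elements * BYTES_PER_ELEM[dtype] // 32

assert burst_blocks(128, "float16") == 128 // 16  # the 8 blocks used by add_tik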
custom_aicore_ops_impl/kv_cache_mgr.py
@@ -0,0 +1,241 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""ascend custom op: kv_cache_mgr by tik"""
+
+import functools
+from tbe import tik
+import tbe.common.platform as tbe_platform
+from tbe.common.utils import para_check
+
+
+# 'pylint: disable=unused-argument,unused-variable,too-many-arguments,too-many-locals
+def check_supported(past, cur, index, out, kernel_name="kv_cache_mgr"):
+    """check data type and shape"""
+    # check data type
+    past_dtype = past.get("dtype").lower()
+    cur_dtype = cur.get("dtype").lower()
+    out_dtype = out.get("dtype").lower()
+
+    if past_dtype != cur_dtype or past_dtype != out_dtype:
+        reason = "past_dtype is %s, cur_dtype is %s, out_dtype is %s" % (past_dtype, cur_dtype, out_dtype)
+        return False, reason
+
+    support_dtype_list = ["float32", "int32", "uint32",
+                          "float16", "int16", "uint16",
+                          "int8", "uint8"]
+    if past_dtype not in support_dtype_list:
+        reason = "past_dtype(%s) is not support" % (past_dtype)
+        return False, reason
+
+    index_dtype = index.get("dtype").lower()
+    if index_dtype != "int32":
+        reason = "index_dtype is %s, not int32" % (index_dtype)
+        return False, reason
+
+    # check shape
+    past_shape = past.get("shape")
+    cur_shape = cur.get("shape")
+
+    if len(past_shape) != 4 or len(cur_shape) != 4:
+        reason = "len(past_shape) != 4 or len(cur_shape) != 4 "
+        return False, reason
+
+    # key_past shape: (bs, num_heads, size_per_head, seq_length)
+    # value_past shape: (bs, num_heads, seq_length, size_per_head)
+    # key shape: (bs, num_heads, 1, size_per_head)
+    # value shape: (bs, num_heads, 1, size_per_head)
+
+    if past_shape[0] != cur_shape[0] or past_shape[1] != cur_shape[1]:
+        reason = "past_shape[0] != cur_shape[0] or past_shape[1] != cur_shape[1] "
+        return False, reason
+
+    if past_shape[3] != cur_shape[3]:
+        reason = "past_shape[3] != cur_shape[3]"
+        return False, reason
+
+    return True, ""
+
+
+def ceil_div(dividend, divisor):
+    return (dividend + divisor - 1) // divisor
+
+
+def get_loop_info(total_num, each_loop_num):
+    loop_times = ceil_div(total_num, each_loop_num)
+    last_loop_num = total_num - each_loop_num * (loop_times - 1)
+    return loop_times, last_loop_num
+
+
+def elements_align(index_elements, data_size, align_size):
+    """Get element num align to align_size"""
+    total_size = index_elements * data_size
+    aligned_total_size = (total_size + align_size - 1) // align_size * align_size
+    return aligned_total_size // data_size
+
+
+class TilingHelper:
+    """Tiling parameter"""
+    def __init__(self, past, cur, index, out, kernel_name="kv_cache_mgr"):
+        self.kernel_name = kernel_name
+
+        # sys info
+        self.core_num = tbe_platform.get_soc_spec(tbe_platform.CORE_NUM)
+        self.ub_size = tbe_platform.get_soc_spec(tbe_platform.UB_SIZE)
+
+        self.past_shape = past.get("shape")
+        self.cur_shape = cur.get("shape")
+        self.index_shape = index.get("shape")
+
+        self.gm_type = past.get("dtype").lower()
+        self.ub_type = self.gm_type
+        self.index_ub_type = "int32"
+        self.int32_size = 4
+
+        self.gm_dtype_size = 2
+        if self.gm_type in ["int8", "uint8"]:
+            self.gm_dtype_size = 1
+        elif self.gm_type in ["float16", "int16", "uint16"]:
+            self.gm_dtype_size = 2
+        elif self.gm_type in ["float32", "int32", "uint32"]:
+            self.gm_dtype_size = 4
+
+        # tiling policy
+        self.seq_length = self.past_shape[2]
+        self.size_per_head = self.past_shape[3]
+        self.update_seq_length = self.cur_shape[2]
+
+        self.num_head = self.past_shape[1]
+
+        self.past_elements = functools.reduce(lambda a, b: a * b, self.past_shape)
+        self.cur_elements = functools.reduce(lambda a, b: a * b, self.cur_shape)
+
+        # The `burst` unit is 32B
+        index_elements = functools.reduce(lambda a, b: a * b, self.index_shape)
+        self.index_elements = elements_align(index_elements, self.int32_size, 32)
+
+        # split cur
+        self.cur_bs = self.cur_shape[0] * self.cur_shape[1]
+        self.each_core_bs_num = ceil_div(self.cur_bs, self.core_num)
+        self.core_num, self.last_core_bs_num = get_loop_info(self.cur_bs, self.each_core_bs_num)
+        self.cur_ub_elements = self.each_core_bs_num * self.update_seq_length * self.size_per_head
+        self.last_cure_ub_elements = self.last_core_bs_num * self.update_seq_length * self.size_per_head
+
+
+class KVCacheImpl(TilingHelper):
+    """KVCacheImpl"""
+    def __init__(self, past, cur, index, out, kernel_name):
+        super().__init__(past, cur, index, out, kernel_name)
+        # key_past or value_past shape: (bs, num_heads, seq_length, size_per_head)
+        # batch_valid_length
+        # cur update shape: (bs, num_heads, 1, size_per_head)
+
+        self.tik_inst = tik.Tik(disable_debug=True)
+        self.past_gm = self.tik_inst.Tensor(self.gm_type, (self.past_elements,), name="past_gm", scope=tik.scope_gm)
+        self.cur_gm = self.tik_inst.Tensor(self.gm_type, (self.cur_elements,), name="cur_gm", scope=tik.scope_gm)
+        self.index_gm = self.tik_inst.Tensor(self.index_ub_type, (self.index_elements,), name="index_gm",
+                                             scope=tik.scope_gm)
+        # we use is_atomic_add=True to set the out_gm zeros. But if inplace update out_gm, no need to set this flag.
+        self.out_gm = self.tik_inst.Tensor(self.gm_type, (self.past_elements,), name="out_gm", scope=tik.scope_gm)
+
+    def valid_cur_ub_load(self, core_idx):
+        """KVCacheImpl.valid_cur_ub_load"""
+        cur_ub = self.tik_inst.Tensor(self.ub_type, (self.cur_ub_elements,), name="valid_cur_ub",
+                                      scope=tik.scope_ubuf)
+        cur_gm_offset = core_idx * self.cur_ub_elements
+        with self.tik_inst.if_scope(core_idx != self.core_num - 1):
+            self.tik_inst.data_move(cur_ub, self.cur_gm[cur_gm_offset:], 0, 1,
+                                    self.cur_ub_elements * self.gm_dtype_size // 32, 0, 0)
+        with self.tik_inst.else_scope():
+            self.tik_inst.data_move(cur_ub, self.cur_gm[cur_gm_offset:], 0, 1,
+                                    self.last_cure_ub_elements * self.gm_dtype_size // 32, 0, 0)
+        return cur_ub
+
+    def valid_index_ub_load(self):
+        """KVCacheImpl.valid_index_ub_load"""
+        index_ub = self.tik_inst.Tensor(self.index_ub_type, (self.index_elements,), name="valid_index_ub",
+                                        scope=tik.scope_ubuf)
+        self.tik_inst.data_move(index_ub, self.index_gm, 0, 1, self.index_elements * self.int32_size // 32, 0, 0)
+        return index_ub
+
+    def valid_pos_update(self, core_idx, cur_ub, index_ub, each_core_bs_num):
+        """KVCacheImpl.valid_pos_update"""
+        src_bs_stride = self.update_seq_length * self.size_per_head
+        dst_bs_stride = self.seq_length * self.size_per_head
+        burst_len = self.update_seq_length * self.size_per_head * self.gm_dtype_size // 32
+
+        valid_idx = self.tik_inst.Scalar(dtype="int32")
+        with self.tik_inst.for_range(0, each_core_bs_num) as each_core_bs_idx:
+            bs_idx = core_idx * self.each_core_bs_num + each_core_bs_idx
+            # because we fused bs * num_head, we need get the real bs_idx
+            valid_idx.set_as(index_ub[bs_idx // self.num_head])
+            with self.tik_inst.if_scope(valid_idx >= 0):
+                dst_offset = bs_idx * dst_bs_stride + valid_idx * self.size_per_head
+                src_offset = each_core_bs_idx * src_bs_stride
+                if burst_len < 65536:
+                    self.tik_inst.data_move(self.out_gm[dst_offset], cur_ub[src_offset],
+                                            0, 1, burst_len, 0, 0)
+                else:
+                    nburst = 1
+                    each_burst_len = burst_len
+                    while each_burst_len > 65535:
+                        nburst += 1
+                        each_burst_len = burst_len // nburst
+                    self.tik_inst.data_move(self.out_gm[dst_offset], cur_ub[src_offset], 0,
+                                            nburst, each_burst_len, 0, 0)
+
+    # 'pylint: disable=too-many-arguments
+    def compute_each_core(self, core_idx, core_bs_num):
+        """KVCacheImpl.compute_each_core"""
+        index_ub = self.valid_index_ub_load()
+        cur_ub = self.valid_cur_ub_load(core_idx)
+        self.valid_pos_update(core_idx, cur_ub, index_ub, core_bs_num)
+
+    def compute(self):
+        """KVCacheImpl.compute"""
+        if self.each_core_bs_num == self.last_core_bs_num:
+            with self.tik_inst.for_range(0, self.core_num, block_num=self.core_num) as core_index:
+                self.compute_each_core(core_idx=core_index, core_bs_num=self.each_core_bs_num)
+        else:
+            with self.tik_inst.for_range(0, self.core_num, block_num=self.core_num) as core_index:
+                with self.tik_inst.if_scope(core_index < self.core_num - 1):
+                    self.compute_each_core(core_idx=core_index, core_bs_num=self.each_core_bs_num)
+                with self.tik_inst.else_scope():
+                    self.compute_each_core(core_idx=core_index, core_bs_num=self.last_core_bs_num)
+
+        self.tik_inst.BuildCCE(kernel_name=self.kernel_name,
+                               inputs=[self.past_gm, self.cur_gm, self.index_gm],
+                               outputs=[self.out_gm],
+                               )
+        return self.tik_inst
+
+
+# 'pylint: disable = unused-argument
+# 'pylint: disable=too-many-arguments,too-many-locals
+@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT,
+                            para_check.REQUIRED_INPUT, para_check.REQUIRED_OUTPUT,
+                            para_check.KERNEL_NAME)
+def kv_cache_mgr(past, cur, index, out, kernel_name="kv_cache_mgr"):
+    """
+    :param past: key_past or value_past. shape: (bs, num_head, seq_length, size_pre_head)
+    :param cur: key_current or value_current. shape: (bs, num_head, update_seq_length, size_pre_head)
+    :param index: which index to update. shape * len(dtype) need be multiples of 32. Option Input.
+    :param out: output shape: (bs, num_head, seq_length, size_pre_head)
+    :param kernel_name: the name of the op
+    :return:
+    """
+    obj = KVCacheImpl(past, cur, index, out, kernel_name)
+    return obj.compute()
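As a rough functional reference (not part of the diff, and ignoring tiling and multi-core scheduling), the kernel writes each batch's current key/value slice into the cache at the sequence position given by the index tensor, skipping negative positions just as the `valid_idx >= 0` guard in valid_pos_update does. A NumPy sketch of that semantics, assuming the shapes documented in the kv_cache_mgr docstring:

import numpy as np

def kv_cache_update_reference(past, cur, index):
    """Illustrative NumPy model of the kv_cache_mgr update (not the TIK kernel itself).

    past:  (bs, num_head, seq_length, size_per_head)
    cur:   (bs, num_head, update_seq_length, size_per_head)
    index: (bs,) int32 sequence positions to update; negative entries are skipped
    """
    out = past.copy()                       # the real kernel may update out_gm in place
    bs, num_head, update_len, _ = cur.shape
    for b in range(bs):
        pos = int(index[b])
        if pos < 0:                         # mirrors the valid_idx >= 0 guard
            continue
        for h in range(num_head):
            out[b, h, pos:pos + update_len, :] = cur[b, h, :, :]
    return out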
custom_aicore_ops_impl/matmul_tik.py
@@ -0,0 +1,212 @@
+"""
+Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+matmul_tik
+"""
+
+from tbe import tik
+from tbe.common.platform import get_soc_spec
+
+DTYPE_SIZE = {
+    'bool': 1,
+    'uint8': 1,
+    'int8': 1,
+    'uint16': 2,
+    'int16': 2,
+    'int24': 3,
+    'uint32': 4,
+    'int32': 4,
+    'float16': 2,
+    'float32': 4,
+    'int48': 6,
+    'int64': 8,
+    'uint64': 8,
+    'float64': 8
+}
+
+
+def MK_TO_K1MK0(tik_instance, mk_input_tensor, k1mk0_tensor, dtype, k1, m, k0):
+    """data move mk to k1mk0"""
+    src_ub = tik_instance.Tensor(dtype, (k1, m, k0), name='src_ub', scope=tik.scope_ubuf)
+
+    # data_move(m, k) ---> (k1, m, k0)
+    with tik_instance.for_range(0, k1) as i:
+        tik_instance.data_move(src_ub[i * m * k0:], mk_input_tensor[i * k0:], 0, m, k0 * DTYPE_SIZE[dtype] // 32,
+                               (k1 - 1) * k0 * DTYPE_SIZE[dtype] // 32, 0)
+
+    tik_instance.data_move(k1mk0_tensor, src_ub, 0, 1, k1 * m * k0 * DTYPE_SIZE[dtype] // 32, 0, 0)
+
+
+def KN_TO_K1NK0(tik_instance, kn_input_tensor, k1nk0_tensor, dtype, k1, n, k0):
+    """data move kn to k1nk0"""
+
+    with tik_instance.for_range(0, k1) as index:
+        k1nk0_ub = tik_instance.Tensor(dtype, (n, k0), tik.scope_ubuf, "k1nk0_ub")
+        src_ub = tik_instance.Tensor(dtype, (k0, n), tik.scope_ubuf, "src_ub")
+        burst_len = k0 * n * DTYPE_SIZE[dtype] // 32
+        tik_instance.data_move(src_ub, kn_input_tensor[index * k0 * n], 0, 1, burst_len, 0, 0)
+        dst_list = [k1nk0_ub[16 * i] for i in range(16)]
+        src_list = [src_ub[n * i] for i in range(16)]
+        rep_times = n // k0
+        dst_rep_stride = k0
+        src_rep_stride = 1
+        tik_instance.vec_trans_scatter(False, False, dst_list, src_list, rep_times, dst_rep_stride, src_rep_stride)
+        tik_instance.data_move(k1nk0_tensor[index * k0 * n], k1nk0_ub, 0, 1, burst_len, 0, 0)
+
+
+def N1MN0_TO_MN(tik_instance, mn_output_tensor, n1mn0_tensor, dtype, n1, m, n0):
+    """data move mn to n1mn0"""
+    src_ub = tik_instance.Tensor(dtype, (m, n1 * n0), name='src_ub', scope=tik.scope_ubuf)
+
+    # data_move(n1, m, n0) ---> (m, n)
+    with tik_instance.for_range(0, n1) as i:
+        tik_instance.data_move(src_ub[i * n0:], n1mn0_tensor[i * m * n0:], 0, m,
+                               n0 * DTYPE_SIZE[dtype] // 32, 0, (n1 - 1) * n0 * DTYPE_SIZE[dtype] // 32)
+
+    tik_instance.data_move(mn_output_tensor, src_ub, 0, 1, m * n1 * n0 * DTYPE_SIZE[dtype] // 32, 0, 0)
+
+
+def matmul_tik_compute(params, kernel_name):
+    """
+    matmul tik compute
+    @param params: matmul data
+    @param kernel_name: kernel name
+    @return: tik instance
+    """
+    tik_instance = tik.Tik()
+    if not isinstance(params, dict):
+        params = params.__dict__
+    m_size, k_size, n_size = params['M'], params['K'], params['N']
+    data_type = params["data_type"]
+    m_tiling_size = int(params["m_tiling_size"])
+    n_tiling_size = int(params["n_tiling_size"])
+    k_tiling_size = int(params['k_tiling_size'])
+
+    m_cycle_times = params["m_cycle_times"]
+    n_cycle_times = params["n_cycle_times"]
+    k_cycle_times = params["k_cycle_times"]
+
+    # Determine the output type
+    if data_type == "float16":
+        if get_soc_spec("SOC_VERSION") in ["SD3403", "OPTG", "Hi3796CV300CS", "TsnsC"]:
+            C_loc_out_type = "float16"
+        else:
+            C_loc_out_type = "float32"
+        K0 = 16
+    else:
+        C_loc_out_type = "int32"
+        K0 = 32
+    block_size = 16
+
+    n_thread_num = params['n_thread_num']
+    m_thread_num = params['m_thread_num']
+    k_thread_num = params['k_thread_num']
+
+    mk_gm_input = tik_instance.Tensor(data_type, (m_size, k_size), name="mk_input_gm", scope=tik.scope_gm)
+    kn_gm_input = tik_instance.Tensor(data_type, (k_size, n_size), name="kn_input_gm", scope=tik.scope_gm)
+
+    k1mk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, m_size, K0), name="k1mk0_workspace",
+                                          scope=tik.scope_gm, is_workspace=True)
+
+    k1nk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, n_size, K0), name="k1nk0_workspace",
+                                          scope=tik.scope_gm, is_workspace=True)
+
+    mn_gm_output = tik_instance.Tensor(C_loc_out_type, (m_size, n_size), tik.scope_gm, name="mn_output_gm")
+    nmk0_workspace = tik_instance.Tensor(C_loc_out_type, (n_size // block_size, m_size, block_size),
+                                         name="nmk0_workspace", scope=tik.scope_gm, is_workspace=True)
+
+    MK_TO_K1MK0(tik_instance, mk_gm_input, k1mk0_workspace, data_type, k_size // K0, m_size, K0)
+    KN_TO_K1NK0(tik_instance, kn_gm_input, k1nk0_workspace, data_type, k_size // K0, n_size, K0)
+
+    # Tiling is realized through the for_range() loop.
+    with tik_instance.for_range(0, 2, block_num=1) as core_id:
+        with tik_instance.for_range(0, n_cycle_times // 2, thread_num=n_thread_num) as n_idx:
+            with tik_instance.for_range(0, m_cycle_times, thread_num=m_thread_num) as m_idx:
+                dst_l0c = tik_instance.Tensor(C_loc_out_type, [n_tiling_size // 16, m_tiling_size, 16], name='dst_l0c',
+                                              scope=tik.scope_cbuf_out)
+                with tik_instance.for_range(0, k_cycle_times,
+                                            thread_num=k_thread_num) as k_idx:
+                    # Calculation result data transfer.
+                    inputa_l1 = tik_instance.Tensor(params['data_type'], [k_tiling_size // K0, m_tiling_size, K0],
+                                                    name="A_tiling_l1", scope=tik.scope_cbuf)
+                    tik_instance.data_move(inputa_l1,
+                                           k1mk0_workspace[k_idx * k_tiling_size // K0, m_idx * m_tiling_size, :],
+                                           0, k_tiling_size // K0, m_tiling_size, m_size - m_tiling_size, 0)
+                    inputb_l1 = tik_instance.Tensor(params["data_type"], [k_tiling_size // K0, n_tiling_size, K0],
+                                                    name="B_tiling_l1", scope=tik.scope_cbuf)
+                    if n_size - n_tiling_size > 65535:
+                        with tik_instance.for_range(0, k_tiling_size // K0) \
+                                as dma_k_idx:
+                            tik_instance.data_move(inputb_l1[dma_k_idx, :, :],
+                                                   k1nk0_workspace[k_idx * k_tiling_size // K0 + dma_k_idx,
+                                                                   (core_id * n_cycle_times // 2 + n_idx)
+                                                                   * n_tiling_size, :],
+                                                   0, 1, n_tiling_size, 0, 0)
+                    else:
+                        tik_instance.data_move(inputb_l1, k1nk0_workspace[k_idx * k_tiling_size // K0,
+                                                                          (core_id * n_cycle_times // 2 + n_idx)
+                                                                          * n_tiling_size, :],
+                                               0, k_tiling_size // K0, n_tiling_size, n_size - n_tiling_size, 0)
+                    # Call matmul API to matrix multiplication calculation.
+                    with tik_instance.if_scope(k_idx == 0):
+                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
+                                            init_l1out=True)
+                    with tik_instance.else_scope():
+                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
+                                            init_l1out=False)
+                tik_instance.fixpipe(nmk0_workspace[n_tiling_size // 16 * (core_id * n_cycle_times // 2 + n_idx),
+                                                    m_idx * m_tiling_size, :],
+                                     dst_l0c, n_tiling_size // 16,
+                                     m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
+                                     (m_size - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] // 32, 0)
+
+    N1MN0_TO_MN(tik_instance, mn_gm_output, nmk0_workspace, C_loc_out_type, n_size // K0, m_size, K0)
+
+    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[mk_gm_input, kn_gm_input], outputs=[mn_gm_output])
+    return tik_instance
+
+
+def matmul_tik(input_x1, input_x2, output_y=None, kernel_name="simple_matmul"):
+    """
+    matmul_tik main func
+    Parameters
+    ----------
+    input_x1: input data 1
+    input_x2: input data 2
+    output_y: output dta
+    """
+    shape_a = input_x1.get("ori_shape")
+    shape_b = input_x2.get("ori_shape")
+    m = shape_a[0]
+    k = shape_a[1]
+    n = shape_b[1]
+    data_type = input_x1.get("dtype").lower()
+    params = {
+        'M': m,
+        'K': k,
+        'N': n,
+        'data_type': data_type,
+        'm_tiling_size': 16,
+        'm_cycle_times': 1,
+        'm_thread_num': 1,
+        'n_tiling_size': 64,
+        'n_cycle_times': 16,
+        'n_thread_num': 1,
+        'k_tiling_size': 32,
+        'k_cycle_times': 2,
+        'k_thread_num': 2,
+        'output_y': output_y
+    }
+    return matmul_tik_compute(params, kernel_name)
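The three data-move helpers above convert between the row-major matrices in global memory and the fractal layouts the cube unit consumes: MK_TO_K1MK0 produces (K//K0, M, K0), KN_TO_K1NK0 produces (K//K0, N, K0) with each K0 x N slab transposed, and N1MN0_TO_MN flattens the (N//16, M, 16) result back to (M, N). A NumPy sketch of the equivalent host-side rearrangements, for illustration only (the TIK versions achieve the same effect with strided data_move and vec_trans_scatter):

import numpy as np

def mk_to_k1mk0(a, k0=16):
    """(M, K) -> (K // k0, M, k0), the layout built by MK_TO_K1MK0."""
    m, k = a.shape
    return a.reshape(m, k // k0, k0).transpose(1, 0, 2)

def kn_to_k1nk0(b, k0=16):
    """(K, N) -> (K // k0, N, k0); each k0 x N slab is transposed, as in KN_TO_K1NK0."""
    k, n = b.shape
    return b.reshape(k // k0, k0, n).transpose(0, 2, 1)

def n1mn0_to_mn(c, n0=16):
    """(N // n0, M, n0) -> (M, N), the inverse step done by N1MN0_TO_MN."""
    n1, m, _ = c.shape
    return c.transpose(1, 0, 2).reshape(m, n1 * n0)

# Small sanity check on aligned shapes (float16 with k0 = 16, assumed for illustration).
a = np.arange(4 * 32, dtype=np.float16).reshape(4, 32)
af = mk_to_k1mk0(a)                       # shape (2, 4, 16)
assert np.array_equal(af[1, 2], a[2, 16:32])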
Binary file changes are not shown in this diff.