mindspore 2.4.1-cp310-cp310-win_amd64.whl → 2.4.10-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/common/api.py +1 -4
- mindspore/common/file_system.py +2 -0
- mindspore/common/parameter.py +1 -14
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/context.py +7 -2
- mindspore/dataset/engine/datasets_standard_format.py +17 -0
- mindspore/dataset/engine/datasets_user_defined.py +27 -1
- mindspore/experimental/llm_boost/__init__.py +2 -2
- mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/include/api/context.h +1 -1
- mindspore/include/dataset/constants.h +2 -2
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +16 -2
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/pooling.py +8 -10
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +71 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +25 -7
- mindspore/ops/auto_generate/gen_ops_prim.py +3 -2
- mindspore/ops/function/math_func.py +5 -4
- mindspore/ops/operations/comm_ops.py +4 -1
- mindspore/ops/operations/custom_ops.py +6 -4
- mindspore/ops/operations/nn_ops.py +7 -2
- mindspore/parallel/_auto_parallel_context.py +23 -4
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_utils.py +0 -1
- mindspore/run_check/_check_version.py +17 -8
- mindspore/train/callback/_tft_register.py +7 -6
- mindspore/train/model.py +1 -0
- mindspore/train/serialization.py +4 -1
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/METADATA +2 -2
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/RECORD +47 -45
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/experimental/llm_boost/atb/llama_boost.py
CHANGED

@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import …
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+
 
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            …
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            …
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
         coder_param = {
-            "…
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [
-                …
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "…
-            "…
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": …
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType": …
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm": …
-            "…
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param,
-            "…
-            "…
-            "…
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param,
-            "…
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
@@ -92,14 +110,15 @@ class LlamaBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps(
-            …
-            …
-            …
-            …
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@ class LlamaBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] = …
-            …
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
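The LlamaBoost.init change above builds one base dict (coder_param) and derives the prefill (encoder) and decode (decoder) variants by dict unpacking before serializing each to JSON for the ATB binder. A minimal, illustrative sketch of that merge-and-serialize pattern in plain Python (not the MindSpore/ATB API; the keys shown are placeholders):

import json

def build_stage_configs(base):
    # Derive per-stage configs from one base dict, then serialize each to a JSON
    # string, mirroring json.dumps({**encoder_param}) / json.dumps({**decoder_param}).
    encoder_param = {**base, "isPrefill": True, "enableLcoc": True}
    decoder_param = {**base, "isPrefill": False, "enableLcoc": False}
    return json.dumps(encoder_param), json.dumps(decoder_param)

base_param = {"numHiddenLayers": 2, "rank": 0, "worldSize": 1}
enc_json, dec_json = build_stage_configs(base_param)
print(enc_json)  # base keys plus "isPrefill": true, "enableLcoc": true
print(dec_json)  # base keys plus "isPrefill": false, "enableLcoc": false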
mindspore/experimental/llm_boost/atb/qwen_boost.py
CHANGED

@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
 
+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            …
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            …
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "…
-            "…
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": …
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [
-                …
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
 
@@ -79,13 +103,14 @@ class QwenBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps(
-            …
-            …
-            …
-            …
-            …
-            …
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@ class QwenBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.…
-            …
-        self.acl_decoder_operation_inputs[…
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
mindspore/include/api/context.h
CHANGED
@@ -236,7 +236,7 @@ std::string DeviceInfoContext::GetProviderDevice() const { return CharToString(G…
 void DeviceInfoContext::SetProviderDevice(const std::string &device) { SetProviderDevice(StringToChar(device)); }
 
 /// \brief Derived from DeviceInfoContext, The configuration of the model running auto on the Host Devices, include
-/// CPU/GPU/NPU/…
+/// CPU/GPU/NPU/Ascend. This option is only valid for MindSpore Lite.
 class MS_API AutoDeviceInfo : public DeviceInfoContext {
  public:
   /// \brief Get the type of this DeviceInfoContext.

mindspore/include/dataset/constants.h
CHANGED

@@ -108,8 +108,8 @@ enum class DATASET_API ManualOffloadMode {
 enum class DATASET_API MapTargetDevice {
   kCpu = 0,     ///< CPU Device.
   kGpu,         ///< Gpu Device.
-  kAscend310,   ///< …
-  kAscend910B,  ///< …
+  kAscend310,   ///< …
+  kAscend910B,  ///< …
   kInvalid = 100
 };
 
mindspore/mindspore_backend.dll
CHANGED
Binary file

mindspore/mindspore_common.dll
CHANGED
Binary file

mindspore/mindspore_core.dll
CHANGED
Binary file

mindspore/mindspore_np_dtype.dll
CHANGED
Binary file

mindspore/mindspore_ops.dll
CHANGED
Binary file
mindspore/nn/__init__.py
CHANGED
@@ -31,6 +31,7 @@ from mindspore.nn.wrap import *
 from mindspore.nn.grad import Jvp, Vjp
 from mindspore.nn.sparse import *
 from mindspore.nn.reinforcement import *
+from mindspore.nn.utils import *
 
 __all__ = ["Cell", "GraphCell"]
 __all__.extend(layer.__all__)
@@ -43,5 +44,6 @@ __all__.extend(sparse.__all__)
 __all__.extend(learning_rate_schedule.__all__)
 __all__.extend(dynamic_lr.__all__)
 __all__.extend(reinforcement.__all__)
+__all__.extend(utils.__all__)
 
 __all__.sort()
mindspore/nn/cell.py
CHANGED
@@ -32,7 +32,8 @@ from mindspore import context
 from mindspore._c_expression import init_pipeline, update_func_graph_hyper_params, Cell_, FuncGraph, MixedPrecisionType
 from mindspore import _checkparam as Validator
 from mindspore.common import dtype as mstype
-from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, …
+from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, \
+    _no_grad
 from mindspore.common.api import _generate_branch_control_input, _convert_python_data, _get_args_for_run_predict
 from mindspore.common.api import _process_dyn_args, _generate_dyn_compile_args
 from mindspore.common.parameter import Parameter, ParameterTuple
@@ -45,6 +46,7 @@ from mindspore._check_jit_forbidden_api import jit_forbidden_register
 from mindspore.common._decorator import deprecated
 from mindspore.common._register_for_recompute import recompute_registry
 
+
 class Cell(Cell_):
     """
     The basic building block of neural networks in MindSpore. The model or neural network layer should inherit this
@@ -2582,7 +2584,7 @@
         """
         if context.get_context("mode") == context.PYNATIVE_MODE:
             self._recompute_cell = recompute_registry.get()(self.construct)
-            self.…
+            self._add_recompute_flag()
             return
         self._recompute()
         if 'mp_comm_recompute' in kwargs.keys():
@@ -2685,6 +2687,18 @@
         if hasattr(network, "_amp_level"):
             self._amp_level = getattr(network, "_amp_level")
 
+    def _add_recompute_flag(self):
+        """
+        Set pynative cell recomputed.
+        """
+        if not self._has_config_recompute:
+            self._has_config_recompute = True
+        else:
+            logger.info("The recompute interface can be configured only once."
+                        " If the parent cell is configured, the child cell should not be configured")
+        for cell in self.cells():
+            cell._add_recompute_flag()
+
 
 class GraphCell(Cell):
     """
mindspore/nn/layer/conv.py
CHANGED
@@ -862,6 +862,9 @@ class Conv3dTranspose(_Conv):
     However, when `stride` > 1, Conv2d maps multiple input shapes to the same output shape. Deconvolutional network
     can refer to `Deconvolutional Networks <https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf>`_.
 
+    Note:
+        For Atlas A2 training series products, `output_padding` is currently not supported.
+
     Args:
         in_channels (int): The channel number of the input tensor of the Conv3dTranspose layer.
         out_channels (int): The channel number of the output tensor of the Conv3dTranspose layer.
mindspore/nn/layer/pooling.py
CHANGED
@@ -297,6 +297,9 @@ class MaxPool3d(_PoolNd):
         \max_{l=0, \ldots, d_{ker}-1} \max_{m=0, \ldots, h_{ker}-1} \max_{n=0, \ldots, w_{ker}-1}
         \text{input}(N_i, C_j, s_0 \times d + l, s_1 \times h + m, s_2 \times w + n)
 
+    .. note::
+        For Atlas training series products, this interface is not supported.
+
     Args:
         kernel_size (Union[int, tuple[int]]): The size of kernel used to take the maximum value,
             is an int number or a single element tuple that represents depth, height and width of the kernel, or a tuple
@@ -1032,16 +1035,11 @@ class AvgPool2dExt(Cell):
         >>> import numpy as np
         >>> from mindspore import Tensor, nn
         >>> from mindspore import dtype as mstype
-        >>> …
-        >>> …
-        >>> output = …
-        >>> print(output)
-        …
-           [ 6.5  7.5  8.5]]
-          [[14.5 15.5 16.5]
-           [18.5 19.5 20.5]]
-          [[26.5 27.5 28.5]
-           [30.5 31.5 32.5]]]]
+        >>> input = Tensor(np.arange(1 * 3 * 3 * 4).reshape(1, 3, 3, 4), mstype.float32)
+        >>> net = nn.AvgPool2dExt(kernel_size=2, stride=1)
+        >>> output = net(input)
+        >>> print(output.shape)
+        (1, 3, 2, 3)
     """
     def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
                  count_include_pad=True, divisor_override=None):
mindspore/nn/utils/__init__.py
ADDED

@@ -0,0 +1,22 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+nn.utils.
+"""
+from __future__ import absolute_import
+
+from .init import no_init_parameters
+
+__all__ = ["no_init_parameters"]
mindspore/nn/utils/init.py
ADDED

@@ -0,0 +1,71 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""init for nn.Cell."""
+from __future__ import absolute_import
+
+from contextlib import contextmanager
+from mindspore.common.parameter import Parameter
+
+
+@contextmanager
+def no_init_parameters():
+    r"""
+    In scenarios where a checkpoint is loaded, parameters within the network instantiation will be
+    instantiated and occupy physical memory. Loading a checkpoint will replace the parameter values.
+    Decorator can be applied during network instantiation to add an attribute `init_param` to all
+    parameters within the current Cell, setting it to `init_param=False` .
+    When `init_param=False` is detected, the initialization of the parameters is skipped,
+    and the parameters are assigned values directly from the checkpoint during loading,
+    which can optimize performance and reduce physical memory usage.
+
+    Note:
+        Initialization of parameters created with `initializer` can only be skipped.
+        Parameters created by `Tensor` or `numpy` cannot be skipped.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> from mindspore import nn, ops, load_checkpoint
+        >>> from mindspore.common.initializer import initializer
+        >>> from mindspore.nn.utils import no_init_parameters
+        >>> # 1. Add a decorator to the network that requires delayed initialization
+        >>> class Net(nn.Cell):
+        ...     def __init__(self, in_channels, out_channels):
+        ...         super().__init__()
+        ...         self.weight = ms.Parameter(initializer("normal", [in_channels, out_channels], ms.float32))
+        ...         self.bias = ms.Parameter(initializer("normal", [out_channels], ms.float32))
+        ...         self.matmul = ops.MatMul()
+        ...         self.add = ops.Add()
+        ...
+        ...     def construct(self, x):
+        ...         x = self.matmul(x, self.weight)
+        ...         x = self.add(x, self.bias)
+        ...         return x
+        >>> with no_init_parameters():
+        ...     # After instantiation, all parameters in the net are not initialized
+        ...     net = Net(28*28, 64)
+        >>> # 2. Load checkpoint parameters to the net
+        >>> load_checkpoint('./checkpoint/test_net.ckpt', net=net)
+        >>> # 3. After loading the checkpoint, manually call init_parameters_data() to initialize
+        >>> # the uninitialized parameters in the net if need. If the network is executed,
+        >>> # the framework will automatically call this interface.
+        >>> net.init_parameters_data()
+    """
+    init_class = Parameter
+    setattr(init_class, "init_param", False)
+    try:
+        yield
+    finally:
+        setattr(init_class, "init_param", True)
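no_init_parameters above works by flipping a class-level attribute on Parameter for the duration of the with-block and restoring it in the finally clause. A self-contained sketch of that contextmanager pattern, using a hypothetical LazyParam class rather than MindSpore's Parameter:

from contextlib import contextmanager

class LazyParam:
    """Hypothetical stand-in: reads a class-level flag the way Parameter reads init_param."""
    init_param = True

    def __init__(self, name):
        self.name = name
        # Materialize data only when the class-level flag allows it.
        self.data = [0.0] * 4 if LazyParam.init_param else None

@contextmanager
def skip_init():
    """Same shape as no_init_parameters: flip the class attribute, restore it in finally."""
    LazyParam.init_param = False
    try:
        yield
    finally:
        LazyParam.init_param = True

with skip_init():
    p = LazyParam("weight")
print(p.data)  # None: allocation was skipped inside the context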
mindspore/ops/_grad_experimental/grad_comm_ops.py
CHANGED

@@ -16,7 +16,7 @@
 """Generate bprop for comm ops"""
 from __future__ import division
 from __future__ import absolute_import
-from mindspore import Tensor
+from mindspore import Tensor, Parameter
 import mindspore.common.dtype as mstype
 from mindspore.ops import functional as F
 from mindspore.communication import get_rank, get_group_size
@@ -37,6 +37,9 @@ from mindspore.ops._grad_experimental.grad_base import bprop_getters
 from mindspore.ops.operations import _grad_ops as G
 import mindspore as ms
 
+_device_local_norm = None
+if ms.get_auto_parallel_context("dump_device_local_norm"):
+    _device_local_norm = Parameter(Tensor(0.0, mstype.float32), name="_device_local_norm", requires_grad=False)
 
 @bprop_getters.register(AllReduce)
 def get_bprop_all_reduce(self):
@@ -247,10 +250,15 @@ def get_bprop_mirror_micro_step_operator(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
-        if dump_local_norm:
-            …
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         real_grad = z
         assign_out = dout
         if issubclass_(F.typeof(dout), mstype.tensor_type):
@@ -373,6 +381,7 @@ def get_bprop_micro_step_all_gather(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
         if with_mirror_operator:
@@ -383,8 +392,12 @@ def get_bprop_micro_step_all_gather(self):
             real_grad = F.tensor_mul(real_grad, scale)
             return (real_grad, cast(out_tensor, dtype(z)))
         z = F.depend(z, dout)
-        if dump_local_norm:
-            …
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         if not do_mirror:
             return (z, cast(out_tensor, dtype(z)))
         real_grad = reduce_scatter(z)
@@ -586,6 +599,7 @@ def get_bprop_mirror_operator(self):
 
     dev_num_r = 1.0
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
     if dev_num > 1:
         dev_num_r = 1.0 / dev_num
         all_reduce = AllReduce(group=group)
@@ -608,8 +622,12 @@ def get_bprop_mirror_operator(self):
         all_reduce.set_prim_instance_name(instance_name)
 
     def bprop(x, out, dout):
-        if dump_local_norm:
-            …
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((dout)))
+            if dump_local_norm:
+                dout = F.depend(dout, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                dout = F.depend(dout, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
 
         if dev_num == 1:
             return (dout,)
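In the bprop hooks above, reduce_sum(square(z)) is the squared L2 norm of the gradient, and assign_add accumulates it into the shared _device_local_norm parameter. A small numpy sketch of that accumulation (illustrative only; the real code uses MindSpore primitives and F.depend for execution ordering):

import numpy as np

# Each gradient contributes reduce_sum(square(grad)) to one running scalar.
device_local_norm = np.float32(0.0)

def accumulate_device_local_norm(grad):
    global device_local_norm
    squared_norm = np.sum(np.square(grad))                 # reduce_sum(square(z))
    device_local_norm += squared_norm.astype(np.float32)   # assign_add(_device_local_norm, ...)
    return grad

for g in (np.array([1.0, 2.0]), np.array([3.0])):
    accumulate_device_local_norm(g)

print(device_local_norm)  # 14.0 = (1^2 + 2^2) + 3^2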
mindspore/ops/auto_generate/gen_ops_prim.py
CHANGED

@@ -2387,7 +2387,8 @@ class BatchMatMul(Primitive):
 
         \text{output}[..., :, :] = \text{matrix}(x[..., :, :]) * \text{matrix}(y[..., :, :])
 
-    The rank of …
+    The rank of the two input tensors must be at least `2`, and the two input tensors must have the same rank
+    if the environment is GPU or CPU.
 
     Args:
         transpose_a (bool): If ``True`` , the last two dimensions of `x` is transposed before multiplication.
@@ -9488,7 +9489,7 @@ class MatMul(Primitive):
 
     .. math::
 
-        …
+        (Output)_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
 
     where the :math:`i,j` indicates the output of the i-th row and j-th column element.
 
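The MatMul formula above is the usual row-by-column summation; a quick numpy check of a single output entry against the explicit sum (illustrative, independent of MindSpore):

import numpy as np

a = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])      # shape (2, 3)
b = np.array([[7.0, 8.0],
              [9.0, 10.0],
              [11.0, 12.0]])         # shape (3, 2)

out = a @ b

# (Output)_{ij} = sum_k a_{ik} * b_{kj}; check entry (0, 1) explicitly.
i, j = 0, 1
explicit = sum(a[i, k] * b[k, j] for k in range(a.shape[1]))
print(out[i, j], explicit)  # both 64.0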
mindspore/ops/function/math_func.py
CHANGED

@@ -9088,9 +9088,9 @@ def remainder(input, other):
     both dtypes cannot be bool, and the shapes of them could be broadcast. When the inputs are one tensor
     and one scalar, the scalar could only be a constant.
 
-    ..…
+    .. code:: python
 
-        remainder(input, other)…
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
     .. warning::
         - When the elements of input exceed 2048, there might be accuracy problems.
@@ -9135,9 +9135,10 @@ def remainder_ext(input, other):
 
     Supports broadcasting to a common shape and implicit type promotion.
 
-    ..…
+    .. code:: python
+
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
-        remainder(input, other) = input - input.div(other, rounding\_mode="floor") * other
 
     Note:
         Complex inputs are not supported. At least one input need to be tensor, but not both are bool tensors.
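The identity in both remainder docstrings can be checked with plain numbers, taking Python's floor division as the rounding_mode="floor" division (illustrative sketch, not the MindSpore implementation):

def remainder(a, b):
    """Floor-division remainder, matching the docstring identity
    remainder(input, other) == input - floor_div(input, other) * other."""
    return a - (a // b) * b

# The result follows the sign of the divisor, unlike a truncating fmod.
print(remainder(7, 3))      # 1
print(remainder(-7, 3))     # 2   because -7 // 3 == -3
print(remainder(7.5, 2.0))  # 1.5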
mindspore/ops/operations/comm_ops.py
CHANGED

@@ -988,6 +988,9 @@ class NeighborExchangeV2(Primitive):
         in the same subnet, please check the `details \
         <https://www.mindspore.cn/docs/en/master/api_python/samples/ops/communicate_ops.html#notes>`_.
 
+        Users need to ensure that the length of the received data `recv_lens` is consistent with that of
+        the sent data `send_lens`.
+
     Args:
         send_rank_ids (list(int)): Ranks which the data is sent to. 8 rank_ids represents 8 directions, if one
             direction is not send to , set it -1.
@@ -1393,7 +1396,7 @@ class Send(PrimitiveWithInfer):
         >>>     def __init__(self):
         >>>         super(SendNet, self).__init__()
         >>>         self.depend = ops.Depend()
-        >>>         self.send = ops.Send(…
+        >>>         self.send = ops.Send(sr_tag=0, dest_rank=8, group="hccl_world_group")
         >>>
         >>>     def construct(self, x):
         >>>         out = self.depend(x, self.send(x))
mindspore/ops/operations/custom_ops.py
CHANGED

@@ -251,11 +251,13 @@ class Custom(ops.PrimitiveWithInfer):
 
         - "xxx.so" file generation:
 
-            1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"),
-                …
+            1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"),
+               use nvcc command to compile
+               it.(ex. :code:`nvcc --shared -Xcompiler -fPIC -o add.so add.cu`)
 
-            2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"),
-                …
+            2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"),
+               use g++/gcc command to
+               compile it.(ex. :code:`g++ --shared -fPIC -o add.so add.cc`)
 
         - Define a "xxx.cc"/"xxx.cu" file:
 