mindspore-2.4.0-cp310-cp310-manylinux1_x86_64.whl → mindspore-2.4.1-cp310-cp310-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/initializer.py +51 -15
- mindspore/common/parameter.py +18 -4
- mindspore/common/tensor.py +15 -49
- mindspore/communication/comm_func.py +7 -7
- mindspore/context.py +9 -0
- mindspore/include/mindapi/base/format.h +13 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_ops.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +10 -10
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +8 -8
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +1 -1
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_internal_kernels.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/base/types.h +5 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +1 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/paged_attention_op.h +6 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/rms_norm_op.h +4 -3
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblcal.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/mint/__init__.py +490 -2
- mindspore/mint/nn/__init__.py +2 -2
- mindspore/mint/optim/adamw.py +6 -14
- mindspore/nn/cell.py +1 -3
- mindspore/nn/layer/basic.py +24 -7
- mindspore/nn/layer/embedding.py +31 -14
- mindspore/nn/optim/tft_wrapper.py +12 -15
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
- mindspore/ops/_grad_experimental/grad_comm_ops.py +20 -1
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +6 -0
- mindspore/ops/auto_generate/gen_extend_func.py +33 -0
- mindspore/ops/auto_generate/gen_ops_def.py +52 -3
- mindspore/ops/auto_generate/gen_ops_prim.py +155 -6
- mindspore/ops/function/array_func.py +2 -0
- mindspore/ops/function/math_func.py +7 -1
- mindspore/ops/function/random_func.py +221 -7
- mindspore/ops/operations/__init__.py +1 -1
- mindspore/ops/operations/array_ops.py +3 -1
- mindspore/ops/operations/comm_ops.py +21 -0
- mindspore/ops/operations/manually_defined/ops_def.py +8 -10
- mindspore/parallel/_auto_parallel_context.py +3 -1
- mindspore/parallel/_cell_wrapper.py +2 -0
- mindspore/parallel/_tensor.py +46 -2
- mindspore/parallel/_utils.py +40 -21
- mindspore/parallel/transform_safetensors.py +196 -43
- mindspore/profiler/profiling.py +5 -1
- mindspore/run_check/_check_version.py +4 -2
- mindspore/train/_utils.py +92 -32
- mindspore/train/callback/_checkpoint.py +12 -9
- mindspore/train/callback/_on_request_exit.py +12 -1
- mindspore/train/callback/_tft_register.py +27 -4
- mindspore/train/dataset_helper.py +10 -2
- mindspore/train/model.py +20 -0
- mindspore/train/serialization.py +8 -18
- mindspore/version.py +1 -1
- {mindspore-2.4.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +8 -6
- {mindspore-2.4.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +100 -100
- {mindspore-2.4.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
mindspore/mint/optim/adamw.py
CHANGED
@@ -22,32 +22,25 @@ from mindspore.common import dtype as mstype
 from mindspore.ops import auto_generate as gen
 from mindspore.experimental.optim.optimizer import Optimizer
 from mindspore import _checkparam as validator
-from mindspore import mint
 
-_optim_adamw_opt = C.MultitypeFuncGraph("optim_adamw_opt")
 hyper_map = C.HyperMap()
 
 
-@_optim_adamw_opt.register("Function", "Float", "Float", "Float", "Float", "Float", "Tensor", "Bool", "Bool", "Tensor",
-                           "Tensor", "Tensor", "Tensor", "Tensor")
 def _run_optim_adamw_amsgrad_opt(opt, beta1, beta2, lr, eps, weight_decay, step, amsgrad, maximize, parameters, grads,
                                  exp_avg, exp_avg_sq, max_exp_avg_sq):
     """Apply adamw optimizer to the weight parameter."""
     success = True
-    opt(parameters, exp_avg, exp_avg_sq, max_exp_avg_sq,
-
+    opt(parameters, exp_avg, exp_avg_sq, max_exp_avg_sq, grads, step, lr, beta1, beta2, weight_decay, eps, amsgrad,
+        maximize)
     return success
 
 
-@_optim_adamw_opt.register("Function", "Float", "Float", "Float", "Float", "Float", "Tensor", "Bool", "Bool", "Tensor",
-                           "Tensor", "Tensor", "Tensor")
 def _run_optim_adamw_opt(opt, beta1, beta2, lr, eps, weight_decay, step, amsgrad, maximize, parameters, grads, exp_avg,
                          exp_avg_sq):
     """Apply adamw optimizer to the weight parameter."""
     success = True
-
-
-        weight_decay, eps, amsgrad, maximize)
+    opt(parameters, exp_avg, exp_avg_sq, exp_avg_sq, grads, step, lr, beta1, beta2, weight_decay, eps, amsgrad,
+        maximize)
     return success
 
 
@@ -177,7 +170,6 @@ class AdamW(Optimizer):
         self.state_step = Parameter(Tensor([-1], mstype.float32), "state_step")
         self.increase_tensor = Tensor(1, mstype.float32)
         self.assignadd = P.AssignAdd()
-        self.op_cast = P.Cast()
         self.adamw_opt = gen.AdamW()
 
     def construct(self, gradients):
@@ -191,13 +183,13 @@ class AdamW(Optimizer):
             grads = tuple(gradients[start_id: end_id])
 
             if group.get("amsgrad"):
-                self.hyper_map(F.partial(
+                self.hyper_map(F.partial(_run_optim_adamw_amsgrad_opt, self.adamw_opt, beta1, beta2, float(lr),
                                          group.get("eps"), group.get("weight_decay"), self.state_step,
                                          group.get("amsgrad"), maximize),
                                self.parameters[start_id: end_id], grads, self.exp_avg[start_id: end_id],
                                self.exp_avg_sq[start_id: end_id], group.get("max_exp_avg_sq"))
             else:
-                self.hyper_map(F.partial(
+                self.hyper_map(F.partial(_run_optim_adamw_opt, self.adamw_opt, beta1, beta2, float(lr),
                                          group.get("eps"), group.get("weight_decay"), self.state_step,
                                          group.get("amsgrad"), maximize),
                                self.parameters[start_id: end_id], grads, self.exp_avg[start_id: end_id],
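For orientation, a minimal usage sketch of the optimizer this file implements (mindspore.mint.optim.AdamW); the toy network, loss and data below are assumptions for illustration and are not part of the diff.

import numpy as np
import mindspore as ms
from mindspore import nn
from mindspore.mint.optim import AdamW

net = nn.Dense(4, 2)        # assumed toy model
loss_fn = nn.MAELoss()      # assumed loss
optimizer = AdamW(net.trainable_params(), lr=1e-3)

def forward(x, y):
    return loss_fn(net(x), y)

grad_fn = ms.value_and_grad(forward, None, net.trainable_params())

def train_step(x, y):
    loss, grads = grad_fn(x, y)
    optimizer(grads)        # dispatches into _run_optim_adamw_opt / _run_optim_adamw_amsgrad_opt shown above
    return loss

x = ms.Tensor(np.random.randn(8, 4), ms.float32)
y = ms.Tensor(np.random.randn(8, 2), ms.float32)
print(train_step(x, y))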
mindspore/nn/cell.py
CHANGED
@@ -1820,9 +1820,6 @@ class Cell(Cell_):
         if not hasattr(self, "_func_graph_flags"):
             self._func_graph_flags = {}
         self._func_graph_flags.update({**flags})
-        if context._get_mode() == context.PYNATIVE_MODE and self._func_graph_flags.get("output_no_recompute"):
-            raise TypeError("Recompute is not supported in PyNative mode currently, you can use "
-                            "'context.set_context(mode=context.GRAPH_MODE)' or @jit to set graph mode.")
         self.__dict__.update({**flags})
         self._add_mixed_precision_flag(**flags)
         return self
@@ -2585,6 +2582,7 @@ class Cell(Cell_):
         """
         if context.get_context("mode") == context.PYNATIVE_MODE:
             self._recompute_cell = recompute_registry.get()(self.construct)
+            self._recompute()
             return
         self._recompute()
         if 'mp_comm_recompute' in kwargs.keys():
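As a hypothetical illustration of what the relaxed PyNative check permits, a short sketch of requesting recomputation on a sub-cell outside graph mode; the toy cell below is assumed, not taken from the package.

import mindspore as ms
from mindspore import nn

ms.set_context(mode=ms.PYNATIVE_MODE)

class Block(nn.Cell):
    def __init__(self):
        super().__init__()
        self.dense = nn.Dense(16, 16)
        self.relu = nn.ReLU()

    def construct(self, x):
        return self.relu(self.dense(x))

block = Block()
block.recompute()   # in 2.4.1 the PyNative branch also runs self._recompute() before returning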
mindspore/nn/layer/basic.py
CHANGED
@@ -579,11 +579,15 @@ class Identity(Cell):
     r"""
     A placeholder identity operator that returns the same as input.
 
+    Args:
+        args (Any): Any argument.
+        kwargs (Any): Any keyword argument.
+
     Inputs:
-        - **
+        - **input** (Any) - The input of Identity.
 
     Outputs:
-        The same as `
+        The same as `input`.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -592,19 +596,19 @@ class Identity(Cell):
         >>> import mindspore
         >>> from mindspore import Tensor, nn
         >>> import numpy as np
-        >>>
+        >>> input = Tensor(np.array([1, 2, 3, 4]), mindspore.int64)
         >>> net = nn.Identity()
-        >>> output = net(
+        >>> output = net(input)
         >>> print(output)
         [1 2 3 4]
     """
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         """Initialize Identity."""
         super(Identity, self).__init__()
 
-    def construct(self,
-        return
+    def construct(self, input):
+        return input
 
 
 class Dense(Cell):
@@ -621,6 +625,9 @@ class Dense(Cell):
     data type as the :math:`X` created by the layer, and :math:`\text{bias}` is a bias vector
     with the same data type as the :math:`X` created by the layer (only if has_bias is True).
 
+    .. warning::
+        In PYNATIVE mode, if `bias` is ``False`` , the `x` cannot be greater than 6D.
+
     Args:
         in_channels (int): The number of channels in the input space.
         out_channels (int): The number of channels in the output space.
@@ -635,6 +642,8 @@ class Dense(Cell):
             layer. Both activation name, e.g. 'relu', and mindspore activation function, e.g. mindspore.ops.ReLU(),
             are supported. Default: ``None`` .
         dtype (:class:`mindspore.dtype`): Data type of Parameter. Default: ``mstype.float32`` .
+            When `weight_init` is Tensor, Parameter has the same data type as `weight_init` ,
+            in other cases, Parameter has the same data type as `dtype`, the same goes for `bias_init`.
 
     Inputs:
         - **x** (Tensor) - Tensor of shape :math:`(*, in\_channels)`. The `in_channels` in `Args` should be equal
@@ -651,6 +660,7 @@ class Dense(Cell):
             is not equal to `out_channels` or shape[1] of `weight_init` is not equal to `in_channels`.
         ValueError: If length of shape of `bias_init` is not equal to 1
             or shape[0] of `bias_init` is not equal to `out_channels`.
+        RuntimeError: If `bias` is ``False`` and `x` is greater than 6D in PYNATIVE mode.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -752,6 +762,9 @@ class Linear(Cell):
     .. math::
         \text{outputs} = X * kernel + bias
 
+    .. warning::
+        In PYNATIVE mode, if `bias` is ``False`` , the `x` cannot be greater than 6D.
+
     where :math:`X` is the input tensors, :math:`\text{kernel}` is a weight matrix with the same
     data type as the :math:`X` created by the layer, and :math:`\text{bias}` is a bias vector
     with the same data type as the :math:`X` created by the layer (only if has_bias is True).
@@ -767,6 +780,9 @@ class Linear(Cell):
             same as `x`. The values of str refer to the function `initializer`. Default: ``None`` ,
             bias will be initialized using Uniform.
         dtype (:class:`mindspore.dtype`): Data type of Parameter. Default: ``None`` .
+            If `dtype` is ``None`` , `dtype` is set to ``mstype.float32`` when initializing the method.
+            When `weight_init` is Tensor, Parameter has the same data type as `weight_init` ,
+            in other cases, Parameter has the same data type as `dtype`, the same goes for `bias_init`.
 
     Inputs:
         - **x** (Tensor) - Tensor of shape :math:`(*, in\_features)`. The `in_features` in `Args` should be equal
@@ -782,6 +798,7 @@ class Linear(Cell):
             is not equal to `out_features` or shape[1] of `weight_init` is not equal to `in_features`.
         ValueError: If length of shape of `bias_init` is not equal to 1
             or shape[0] of `bias_init` is not equal to `out_features`.
+        RuntimeError: If `bias` is ``False`` and `x` is greater than 6D in PYNATIVE mode.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
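A small, assumed check of the dtype note added to the Dense docstring above: when weight_init is a Tensor, the created parameter keeps weight_init's dtype rather than dtype. The values are illustrative only.

import numpy as np
import mindspore as ms
from mindspore import nn, Tensor

w = Tensor(np.ones((2, 4)), ms.float16)                  # assumed initial weight, shape (out_channels, in_channels)
dense = nn.Dense(4, 2, weight_init=w, dtype=ms.float32)
print(dense.weight.dtype)                                # float16, taken from weight_init per the added note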
mindspore/nn/layer/embedding.py
CHANGED
@@ -164,11 +164,11 @@ class Embedding(Cell):
 
 class EmbeddingExt(Cell):
     r"""
-
-    Retrieve the word embeddings in weight stored in the layer using indices specified in `input`.
+    The value in `input` is used as the index, and the corresponding embedding vector is queried from `weight` .
 
     .. warning::
-
+        - This is an experimental API that is subject to change or deletion.
+        - On Ascend, the behavior is unpredictable when the value of `input` is invalid.
 
     Args:
         num_embeddings (int): Size of the dictionary of embeddings.
@@ -183,14 +183,22 @@ class EmbeddingExt(Cell):
         norm_type (float, optional): Indicated the value of p in p-norm. Default ``2.0``.
         scale_grad_by_freq (bool, optional): If ``True`` the gradients will be scaled by the inverse of frequency
             of the index in `input`. Default ``False``.
-
+        sparse (bool, optional): If ``True``, gradient w.r.t. `weight` matrix will be a sparse tensor which
+            has not been supported. Default: ``False``.
+        _weight (Tensor, optional): Used to initialize the `weight` of Embedding. If ``None``, the weight will be
             initialized from normal distribution :math:`{N}(\text{sigma=1.0}, \text{mean=0.0})`. Default ``None``.
-
-            Default: ``
+        _freeze(bool, optional): If `weight` , the learnable weights of this module, should be freezed.
+            Default: ``False``.
+        dtype (mindspore.dtype, optional) : Dtype of Embedding's `weight` . It is meaningless when `_weight` is
+            not None. Default: ``None``.
+
+    Variables:
+        weight (Parameter): The learnable weights of this module of shape (num_embeddings, embedding_dim), which
+            initialized from :math:`{N}(\text{sigma=1.0}, \text{mean=0.0})` or `_weight` .
 
     Inputs:
         - **input** (Tensor) - The indices used to lookup in the embedding vector. The data type must be
-
+          int32 or int64, and the value should be in range `[0, num_embeddings)`.
 
     Outputs:
         Tensor, has the same data type as weight, the shape is :math:`(*input.shape, embedding\_dim)`.
@@ -202,6 +210,7 @@ class EmbeddingExt(Cell):
         TypeError: If `max_norm` is not a float.
         TypeError: If `norm_type` is not a float.
         TypeError: If `scale_grad_by_freq` is not a bool.
+        ValueError: If `weight.shape` is invalid.
         TypeError: If `dtype` is not one of mindspore.dtype.
 
     Supported Platforms:
@@ -212,7 +221,7 @@ class EmbeddingExt(Cell):
         >>> import numpy as np
         >>> from mindspore import Tensor, nn
        >>> input = Tensor([[1, 0, 1, 1], [0, 0, 1, 0]])
-        >>> embedding = nn.
+        >>> embedding = nn.EmbeddingExt(num_embeddings=10, embedding_dim=3)
         >>> output = embedding(input)
         >>> print(output)
         [[[-0.0024154 -0.01203444 0.00811537]
@@ -226,23 +235,30 @@ class EmbeddingExt(Cell):
     """
 
     def __init__(self, num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0,
-                 scale_grad_by_freq=False, _weight=None, dtype=
+                 scale_grad_by_freq=False, sparse=False, _weight=None, _freeze=False, dtype=None):
         """Initialize Embedding."""
         super().__init__()
+        self.sparse = Validator.check_value_type('sparse', sparse, [bool], self.cls_name)
+        if self.sparse:
+            raise ValueError("For Embedding, the scenerio, where `sparse` is True, has not be supported.")
         self.num_embeddings = Validator.check_value_type(
             'num_embeddings', num_embeddings, [int], self.cls_name)
         self.embedding_dim = Validator.check_value_type(
             'embedding_dim', embedding_dim, [int], self.cls_name)
+        self.dtype = dtype if dtype is not None else mstype.float32
         Validator.check_subclass(
-            "dtype", dtype, mstype.number_type, self.cls_name)
-        self.dtype = dtype
+            "dtype", self.dtype, mstype.number_type, self.cls_name)
         self.padding_idx = padding_idx
         if _weight is None:
-            init_tensor = Tensor(shape=[num_embeddings, embedding_dim], dtype=dtype, init=Normal(1, 0))
+            init_tensor = Tensor(shape=[num_embeddings, embedding_dim], dtype=self.dtype, init=Normal(1, 0))
             init_tensor = self._zero_weight_by_index(init_tensor)
-            self.weight = Parameter(init_tensor, name='weight')
+            self.weight = Parameter(init_tensor, name='weight', requires_grad=not _freeze)
         else:
-
+            if _weight.shape != (num_embeddings, embedding_dim):
+                raise ValueError(f"For Embedding, shape of weight should be match with num_embeddings "
+                                 f"and embedding_dim, but got weight.shape: {_weight.shape}, "
+                                 f"and (num_embeddings, embedding_dim): ({num_embeddings}, {embedding_dim})")
+            self.weight = Parameter(_weight, name='weight', requires_grad=not _freeze)
 
         self.max_norm = max_norm
         if max_norm is not None:
@@ -300,6 +316,7 @@ class EmbeddingLookup(Cell):
         specified 'axis = 0' to lookup table.
         In field slice mode, the manual_shapes must be given. It is a tuple ,where
         the element is vocab[i], vocab[i] is the row numbers for i-th part.
+        This module does not support the PyNative mode.
 
     Args:
         vocab_size (int): Size of the dictionary of embeddings.
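To illustrate the constructor arguments documented above (_weight, _freeze, and the new shape check), a brief assumed sketch; the pretrained table and sizes are made up.

import numpy as np
import mindspore as ms
from mindspore import nn, Tensor

pretrained = Tensor(np.random.randn(10, 3), ms.float32)    # assumed pretrained table, shape (num_embeddings, embedding_dim)
embedding = nn.EmbeddingExt(num_embeddings=10, embedding_dim=3, _weight=pretrained, _freeze=True)

ids = Tensor([[1, 0, 1, 1], [0, 0, 1, 0]], ms.int32)
print(embedding(ids).shape)              # (2, 4, 3)
print(embedding.weight.requires_grad)    # False, because _freeze=True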
mindspore/nn/optim/tft_wrapper.py
CHANGED
@@ -20,6 +20,8 @@ from mindspore.common.tensor import Tensor
 from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.ops.operations.manually_defined._inner import TensorReport
 from mindspore import ops, context
+from mindspore.common.parameter import Parameter
+import mindspore.common.dtype as mstype
 
 class OptTFTWrapper(Optimizer):
     r"""
@@ -61,9 +63,9 @@ class OptTFTWrapper(Optimizer):
     """
 
     def __init__(self, opt, **kwargs):
-        super(OptTFTWrapper, self).__init__(opt.learning_rate, opt._parameters)  # pylint: disable=W0212
         if not isinstance(opt, Optimizer):
             raise TypeError(f"For 'OptTFTWrapper', the argument 'opt' must be Optimizer type, " f"but got {type(opt)}.")
+        super(OptTFTWrapper, self).__init__(opt.learning_rate, opt._parameters)  # pylint: disable=W0212
         tft_env = os.getenv("MS_ENABLE_TFT", "")
         if ("TTP:1" not in tft_env) and ("UCE:1" not in tft_env):
             raise ValueError("MindIO TFT regitster need custom switch on[MS_ENABLE_TFT='{TTP:1,UCE:1}']!")
@@ -74,13 +76,9 @@ class OptTFTWrapper(Optimizer):
         self.opt = opt
         self.report = TensorReport()
         self.depend = ops.Depend()
-        self.
-
-        self.
-
-        if self.use_allreduce:
-            self.allreduce_sum = ops.AllReduce()
-            self.allreduce_sum.add_prim_attr("tft_report_before", True)
+        self.allreduce_sum = ops.AllReduce()
+        self.allreduce_sum.add_prim_attr("tft_report_before", True)
+        self.tft_g_one_flag = Parameter(Tensor([1], dtype=mstype.int32))
 
         self.param_rank = opt.param_rank
         self.optim_filter = opt.optim_filter
@@ -118,10 +116,9 @@ class OptTFTWrapper(Optimizer):
         self.enable_tuple_broaden = opt.enable_tuple_broaden
 
     def construct(self, gradients):
-
-
-
-
-
-
-        return self.opt(gradients)
+        tft_g_one_flag = self.depend(self.tft_g_one_flag, gradients)
+        self.tft_g_one_flag = self.allreduce_sum(tft_g_one_flag)
+
+        grads = self.depend(gradients, self.report("tft_report", self.tft_g_one_flag))
+        opt_ret = self.opt(grads)
+        return opt_ret
mindspore/ops/_grad_experimental/grad_array_ops.py
CHANGED
@@ -38,7 +38,6 @@ from mindspore.ops.operations.array_ops import SegmentMean
 from mindspore.ops.operations.array_ops import AffineGrid
 from mindspore.ops.operations.array_ops import MaskedScatter
 from mindspore.ops.operations.array_ops import MaskedSelect
-from mindspore.ops.operations.array_ops import CountNonZero
 from mindspore.ops.operations.random_ops import LogNormalReverse
 from mindspore.ops.operations.random_ops import ParameterizedTruncatedNormal
 from mindspore.ops.operations import _inner_ops as inner
@@ -125,16 +124,6 @@ def get_bprop_masked_scatter(self):
     return bprop
 
 
-@bprop_getters.register(CountNonZero)
-def get_bprop_countnonzero(self):
-    """Grad definition for CountNonZero"""
-
-    def bprop(x, out, dout):
-        return (zeros_like(x),)
-
-    return bprop
-
-
 @bprop_getters.register(Mvlgamma)
 def get_bprop_mvlgamma(self):
     """Grad definition for Mvlgamma"""
mindspore/ops/_grad_experimental/grad_comm_ops.py
CHANGED
@@ -31,7 +31,8 @@ from mindspore.ops.operations.comm_ops import (AllGather, _MiniStepAllGather, _H
                                                _GetTensorSlice, _MirrorOperator, _MirrorMiniStepOperator, ReduceOp,
                                                ReduceScatter, _HostReduceScatter, _VirtualDiv, _VirtualAdd, _AllSwap,
                                                _VirtualAssignAdd, _VirtualAccuGrad, _MirrorMicroStepOperator,
-                                               _MicroStepAllGather, Reduce, CollectiveGather, CollectiveScatter
+                                               _MicroStepAllGather, Reduce, CollectiveGather, CollectiveScatter,
+                                               _VirtualAssignKvCache)
 from mindspore.ops._grad_experimental.grad_base import bprop_getters
 from mindspore.ops.operations import _grad_ops as G
 import mindspore as ms
@@ -179,6 +180,24 @@ def get_bprop_virtual_assign_add(self):
     return bprop
 
 
+@bprop_getters.register(_VirtualAssignKvCache)
+def get_bprop_virtual_assign_kv_cache(self):
+    """Generate bprop for VirtualAssignAdd."""
+    assign = P.Assign()
+    cast = P.Cast()
+    dtype = P.DType()
+    out_tensor = Tensor(0.0, mstype.float16)
+
+    def bprop(x, y, seq_chunk, out, dout):
+        dout_update = dout + y
+        kv_equal = F.equal(seq_chunk, 0)
+        update_kv = F.select(kv_equal, F.broadcast_to(cast(out_tensor, dtype(y)), F.shape(y)), dout_update)
+        return F.depend((dout_update, cast(out_tensor, dtype(y)),
+                         cast(out_tensor, dtype(seq_chunk))), assign(y, update_kv))
+
+    return bprop
+
+
 @bprop_getters.register(_VirtualAccuGrad)
 def get_bprop_virtual_accu_grad(self):
     """Generate bprop for VirtualAccuGrad."""
mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py
CHANGED
@@ -65,6 +65,7 @@ op_args_default_value = {
     "ConvolutionGrad": {"bias": None, "stride": 1, "padding": 0, "dilation": 1, "transposed": False, "output_padding": 0, "groups": 1, "output_mask": ()},
     "Convolution": {"bias": None, "stride": 1, "padding": 0, "dilation": 1, "transposed": False, "output_padding": 0, "groups": 1},
     "Correlate": {"mode": 'valid'},
+    "CountNonZero": {"dim": None},
     "Cross": {"dim": -65530},
     "CumProd": {"exclusive": False, "reverse": False},
     "CumSum": {"exclusive": False, "reverse": False},
@@ -185,6 +186,11 @@ op_args_default_value = {
     "Qr": {"full_matrices": False},
     "RandExt": {"dtype": None},
     "RandLikeExt": {"dtype": None},
+    "RandIntLike": {"dtype": None},
+    "RandInt": {"dtype": None},
+    "RandnLike": {"dtype": None},
+    "Randn": {"dtype": None},
+    "RandpermExt": {"dtype": mstype.int64},
     "RandpermV2": {"seed": 0, "offset": 0, "dtype": mstype.int64},
     "Range": {"maxlen": 1000000},
     "ReduceAll": {"axis": None, "keep_dims": False},
mindspore/ops/auto_generate/gen_extend_func.py
CHANGED
@@ -1350,6 +1350,39 @@ def prod(input, axis=None, keep_dims=False, dtype=None):
     return prod_impl(input, axis, keep_dims, dtype)
 
 
+def select(input, dim, index):
+    r"""
+    Slices the input tensor along the selected dimension at the given index.
+
+    .. warning::
+        This is an experimental API that is subject to change or deletion.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to slice.
+        index (int): the index to select with.
+
+    Returns:
+        Tensor.
+
+    Raises:
+        TypeError: If input is not a Tensor.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore import Tensor, mint
+        >>> input = Tensor([[2, 3, 4, 5],[3, 2, 4, 5]])
+        >>> y = mint.select(input, 0, 0)
+        >>> y = Tensor([1,2], mindspore.float32)
+        >>> print(y)
+        [2 3 4 5]
+    """
+    return select_impl(input, dim, index)
+
+
 def selu(input):
     r"""
     Activation function SELU (Scaled exponential Linear Unit).
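Because the bundled docstring example above reassigns y before printing, here is a separate, assumed sketch of what mint.select returns; it is illustrative only (the docstring lists only Ascend as a supported platform).

import mindspore as ms
from mindspore import Tensor, mint

x = Tensor([[2, 3, 4, 5],
            [3, 2, 4, 5]], ms.int64)

row = mint.select(x, 0, 0)   # slice along dim 0 at index 0 -> first row
col = mint.select(x, 1, 1)   # slice along dim 1 at index 1 -> second column
print(row)                   # [2 3 4 5]
print(col)                   # [3 2]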
mindspore/ops/auto_generate/gen_ops_def.py
CHANGED
@@ -1655,6 +1655,54 @@ def cosh(input):
     return cosh_op(input)
 
 
+def count_nonzero(input, dim=None):
+    r"""
+    Counts the number of non-zero values in the tensor input along the given dim. If no dim is specified then all non-zeros in the tensor are counted.
+
+    .. warning::
+        This is an experimental API that is subject to change or deletion.
+
+    Args:
+        input (Tensor): Input data is used to count non-zero numbers. With shape
+            :math:`(*)` where :math:`*` means, any number of additional dimensions.
+        dim (Union[int, tuple(int), list(int)], optional): The dimension to reduce. Default value: ``None``, which indicates that the number of non-zero elements is calculated. If `dim` is ``None``, all elements in the tensor are summed up.
+
+    Returns:
+        Tensor, number of nonzero element across dim specified by `dim`.
+
+    Raises:
+        TypeError: If `input` is not tensor.
+        TypeError: If `dim` is not int, tuple(int), list(int) or None.
+        ValueError: If any value in `dim` is not in range [-x.ndim, x.ndim).
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore import Tensor, ops
+        >>> import numpy as np
+        >>> import mindspore
+        >>> # case 1: each value specified.
+        >>> x = Tensor(np.array([[0, 1, 0], [1, 1, 0]]).astype(np.float32))
+        >>> nonzero_num = ops.count_nonzero(input=x, dim=[0, 1])
+        >>> print(nonzero_num)
+        [[3]]
+        >>> # case 2: all value is default.
+        >>> nonzero_num = ops.count_nonzero(input=x)
+        >>> print(nonzero_num)
+        3
+        >>> # case 3: dim value was specified 0.
+        >>> nonzero_num = ops.count_nonzero(input=x, dim=[0,])
+        >>> print(nonzero_num)
+        [1 2 0]
+        >>> # case 4: dim value was specified 1.
+        >>> nonzero_num = ops.count_nonzero(input=x, dim=[1,])
+        >>> print(nonzero_num)
+        [1 2]
+    """
+    return count_nonzero_op(input, dim)
+
+
 def cummax(input, axis):
     r"""
     Returns a tuple (values,indices) where 'values' is the cumulative maximum value of input Tensor `input`
@@ -1860,7 +1908,8 @@ def dense(input, weight, bias=None):
         output = input * weight^{T} + bias
 
     .. warning::
-        This is an experimental API that is subject to change or deletion.
+        - This is an experimental API that is subject to change or deletion.
+        - In PYNATIVE mode, if `bias` is not 1D, the `input` cannot be greater than 6D.
 
     Args:
         input (Tensor): Input Tensor of shape :math:`(*, in\_channels)`,
@@ -1877,6 +1926,7 @@ def dense(input, weight, bias=None):
         TypeError: If `input` is not Tensor.
         TypeError: If `weight` is not Tensor.
         TypeError: If `bias` is not Tensor.
+        RuntimeError: If `bias` is not 1D and `input` is greater than 6D in PYNATIVE mode.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -6404,7 +6454,7 @@ def rotary_position_embedding(x, cos, sin, mode=0):
 
     Args:
         x (Tensor): 4D tensor, with float16, bfloat16 or float32 data type.
-        cos (Tensor): 4D
+        cos (Tensor): 4D constant, has the same type as `x` , in range of [-1, 1].
         sin (Tensor): Same with `cos` .
        mode (int): An optional attribute. Used to select a calculation mode. 0: rotate_half(GPT-NeoX style); 1: rotate_interleaved(GPT-J style). Defaults to ``0`` .
 
@@ -6420,7 +6470,6 @@ def rotary_position_embedding(x, cos, sin, mode=0):
 
       11SD, B1SD, BNSD; D < 896 and D is an Even. B, N < 1000;
 
-      B * N <= 1024 if gradient calculation of cos/sin is used.
       - Supported layout: 11SD, B1SD, BNSD;
 
       D < 896 and D is an Even.