mindspore-2.7.0-cp311-cp311-win_amd64.whl → mindspore-2.7.1-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +4 -1
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_extends/parse/compile_config.py +24 -1
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -2
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +8 -1
- mindspore/_extends/parse/trope.py +2 -1
- mindspore/_extends/pijit/pijit_func_white_list.py +7 -22
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/base.py +29 -2
- mindspore/common/_decorator.py +3 -2
- mindspore/common/_grad_function.py +3 -1
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +275 -64
- mindspore/common/_utils.py +0 -44
- mindspore/common/api.py +285 -35
- mindspore/common/dump.py +7 -108
- mindspore/common/dynamic_shape/auto_dynamic_shape.py +1 -3
- mindspore/common/hook_handle.py +60 -0
- mindspore/common/jit_config.py +5 -1
- mindspore/common/jit_trace.py +27 -12
- mindspore/common/lazy_inline.py +5 -3
- mindspore/common/parameter.py +13 -107
- mindspore/common/recompute.py +4 -11
- mindspore/common/tensor.py +16 -169
- mindspore/communication/_comm_helper.py +11 -1
- mindspore/communication/comm_func.py +138 -4
- mindspore/communication/management.py +85 -1
- mindspore/config/op_info.config +0 -15
- mindspore/context.py +5 -85
- mindspore/dataset/engine/datasets.py +8 -4
- mindspore/dataset/engine/datasets_vision.py +1 -1
- mindspore/dataset/engine/validators.py +1 -15
- mindspore/dnnl.dll +0 -0
- mindspore/{experimental/llm_boost/ascend_native → graph}/__init__.py +7 -7
- mindspore/graph/custom_pass.py +55 -0
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/__init__.py +3 -3
- mindspore/mindrecord/common/exceptions.py +1 -0
- mindspore/mindrecord/config.py +1 -1
- mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
- mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
- mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
- mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
- mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
- mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
- mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
- mindspore/mindrecord/filereader.py +4 -4
- mindspore/mindrecord/filewriter.py +5 -5
- mindspore/mindrecord/mindpage.py +2 -2
- mindspore/mindrecord/tools/cifar10.py +1 -1
- mindspore/mindrecord/tools/cifar100.py +1 -1
- mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
- mindspore/mindrecord/tools/cifar10_to_mr.py +1 -1
- mindspore/mindrecord/tools/csv_to_mr.py +1 -1
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_cluster.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_hardware_abstract.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mindspore_runtime_utils.dll +0 -0
- mindspore/mindspore_tools.dll +0 -0
- mindspore/mint/__init__.py +15 -10
- mindspore/mint/distributed/distributed.py +182 -62
- mindspore/mint/nn/__init__.py +2 -16
- mindspore/mint/nn/functional.py +4 -110
- mindspore/mint/nn/layer/__init__.py +0 -2
- mindspore/mint/nn/layer/activation.py +0 -6
- mindspore/mint/nn/layer/basic.py +0 -47
- mindspore/mint/nn/layer/conv.py +4 -4
- mindspore/mint/nn/layer/normalization.py +8 -13
- mindspore/mint/nn/layer/pooling.py +0 -4
- mindspore/nn/__init__.py +1 -3
- mindspore/nn/cell.py +16 -66
- mindspore/nn/layer/basic.py +49 -1
- mindspore/nn/layer/container.py +16 -0
- mindspore/nn/layer/embedding.py +4 -169
- mindspore/nn/layer/normalization.py +2 -1
- mindspore/nn/layer/thor_layer.py +4 -85
- mindspore/nn/optim/ada_grad.py +0 -1
- mindspore/nn/optim/adafactor.py +0 -1
- mindspore/nn/optim/adam.py +31 -124
- mindspore/nn/optim/adamax.py +0 -1
- mindspore/nn/optim/asgd.py +0 -1
- mindspore/nn/optim/ftrl.py +8 -102
- mindspore/nn/optim/lamb.py +0 -1
- mindspore/nn/optim/lars.py +0 -3
- mindspore/nn/optim/lazyadam.py +25 -218
- mindspore/nn/optim/momentum.py +5 -43
- mindspore/nn/optim/optimizer.py +6 -55
- mindspore/nn/optim/proximal_ada_grad.py +0 -1
- mindspore/nn/optim/rmsprop.py +0 -1
- mindspore/nn/optim/rprop.py +0 -1
- mindspore/nn/optim/sgd.py +0 -1
- mindspore/nn/optim/tft_wrapper.py +0 -1
- mindspore/nn/optim/thor.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -8
- mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
- mindspore/nn/probability/bijector/power_transform.py +20 -21
- mindspore/nn/probability/bijector/scalar_affine.py +5 -5
- mindspore/nn/probability/bijector/softplus.py +13 -14
- mindspore/nn/wrap/grad_reducer.py +4 -74
- mindspore/numpy/array_creations.py +2 -2
- mindspore/numpy/fft.py +9 -9
- mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
- mindspore/onnx/onnx_export.py +137 -0
- mindspore/opencv_core4110.dll +0 -0
- mindspore/opencv_imgcodecs4110.dll +0 -0
- mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
- mindspore/ops/__init__.py +2 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
- mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
- mindspore/ops/_op_impl/cpu/__init__.py +0 -5
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +16 -22
- mindspore/ops/auto_generate/gen_extend_func.py +2 -7
- mindspore/ops/auto_generate/gen_ops_def.py +98 -141
- mindspore/ops/auto_generate/gen_ops_prim.py +12708 -12686
- mindspore/ops/communication.py +97 -0
- mindspore/ops/composite/__init__.py +5 -2
- mindspore/ops/composite/base.py +15 -1
- mindspore/ops/composite/multitype_ops/__init__.py +3 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
- mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
- mindspore/ops/function/__init__.py +1 -0
- mindspore/ops/function/array_func.py +14 -12
- mindspore/ops/function/comm_func.py +3883 -0
- mindspore/ops/function/debug_func.py +3 -4
- mindspore/ops/function/math_func.py +45 -54
- mindspore/ops/function/nn_func.py +75 -294
- mindspore/ops/function/random_func.py +9 -18
- mindspore/ops/functional.py +2 -0
- mindspore/ops/functional_overload.py +354 -18
- mindspore/ops/operations/__init__.py +2 -5
- mindspore/ops/operations/_custom_ops_utils.py +7 -9
- mindspore/ops/operations/_inner_ops.py +1 -38
- mindspore/ops/operations/_rl_inner_ops.py +0 -933
- mindspore/ops/operations/array_ops.py +1 -0
- mindspore/ops/operations/comm_ops.py +94 -2
- mindspore/ops/operations/custom_ops.py +228 -19
- mindspore/ops/operations/debug_ops.py +27 -29
- mindspore/ops/operations/manually_defined/ops_def.py +27 -306
- mindspore/ops/operations/nn_ops.py +2 -2
- mindspore/ops/operations/sparse_ops.py +0 -83
- mindspore/ops/primitive.py +1 -17
- mindspore/ops/tensor_method.py +72 -3
- mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
- mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
- mindspore/ops_generate/api/functions_cc_generator.py +53 -4
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
- mindspore/ops_generate/common/gen_constants.py +11 -10
- mindspore/ops_generate/common/op_proto.py +18 -1
- mindspore/ops_generate/common/template.py +102 -245
- mindspore/ops_generate/common/template_utils.py +212 -0
- mindspore/ops_generate/gen_custom_ops.py +69 -0
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
- mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
- mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
- mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
- mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
- mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
- mindspore/ops_generate/resources/yaml_loader.py +13 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
- mindspore/parallel/_cell_wrapper.py +1 -1
- mindspore/parallel/_parallel_serialization.py +1 -4
- mindspore/parallel/_utils.py +29 -6
- mindspore/parallel/checkpoint_transform.py +18 -2
- mindspore/parallel/cluster/process_entity/_api.py +24 -32
- mindspore/parallel/cluster/process_entity/_utils.py +9 -5
- mindspore/{experimental/llm_boost/atb → parallel/distributed}/__init__.py +21 -23
- mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
- mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
- mindspore/parallel/strategy.py +336 -0
- mindspore/parallel/transform_safetensors.py +117 -16
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +3 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
- mindspore/profiler/common/constant.py +5 -0
- mindspore/profiler/common/file_manager.py +9 -0
- mindspore/profiler/common/msprof_cmd_tool.py +38 -2
- mindspore/profiler/common/path_manager.py +56 -24
- mindspore/profiler/common/profiler_context.py +2 -12
- mindspore/profiler/common/profiler_info.py +3 -3
- mindspore/profiler/common/profiler_path_manager.py +13 -0
- mindspore/profiler/common/util.py +30 -3
- mindspore/profiler/experimental_config.py +2 -1
- mindspore/profiler/platform/npu_profiler.py +33 -6
- mindspore/run_check/_check_version.py +108 -24
- mindspore/runtime/__init__.py +3 -2
- mindspore/runtime/executor.py +11 -3
- mindspore/runtime/memory.py +112 -0
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
- mindspore/tools/data_dump.py +130 -0
- mindspore/tools/sdc_detect.py +91 -0
- mindspore/tools/stress_detect.py +63 -0
- mindspore/train/__init__.py +6 -6
- mindspore/train/_utils.py +5 -18
- mindspore/train/amp.py +6 -4
- mindspore/train/callback/_checkpoint.py +0 -9
- mindspore/train/callback/_train_fault_tolerance.py +69 -18
- mindspore/train/data_sink.py +1 -5
- mindspore/train/model.py +38 -211
- mindspore/train/serialization.py +126 -387
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +6 -3
- mindspore/utils/dlpack.py +92 -0
- mindspore/utils/dryrun.py +1 -1
- mindspore/utils/runtime_execution_order_check.py +10 -0
- mindspore/utils/sdc_detect.py +14 -12
- mindspore/utils/stress_detect.py +43 -0
- mindspore/utils/utils.py +144 -8
- mindspore/version.py +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/RECORD +254 -267
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -210
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
- mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
- mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
- mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
- mindspore/experimental/llm_boost/register.py +0 -130
- mindspore/experimental/llm_boost/utils.py +0 -31
- mindspore/include/OWNERS +0 -7
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
- mindspore/nn/reinforcement/_batch_read_write.py +0 -142
- mindspore/nn/reinforcement/_tensors_queue.py +0 -152
- mindspore/nn/reinforcement/tensor_array.py +0 -145
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
- mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
- mindspore/ops/_op_impl/cpu/buffer_append.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
- mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
- mindspore/ops/operations/_tensor_array.py +0 -359
- mindspore/ops/operations/rl_ops.py +0 -288
- mindspore/parallel/_offload_context.py +0 -275
- mindspore/parallel/_recovery_context.py +0 -115
- mindspore/parallel/_transformer/__init__.py +0 -35
- mindspore/parallel/_transformer/layers.py +0 -765
- mindspore/parallel/_transformer/loss.py +0 -251
- mindspore/parallel/_transformer/moe.py +0 -693
- mindspore/parallel/_transformer/op_parallel_config.py +0 -222
- mindspore/parallel/_transformer/transformer.py +0 -3124
- mindspore/parallel/mpi/_mpi_config.py +0 -116
- mindspore/train/memory_profiling_pb2.py +0 -298
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
mindspore/runtime/memory.py CHANGED
@@ -14,9 +14,15 @@
 # ============================================================================
 
 """Memory interfaces."""
+import contextlib
+import ctypes
 import os
 from mindspore._c_expression import RuntimeConf, DeviceManagerConf, _memory_stats, \
     _reset_max_mem_reserved, _reset_max_mem_allocated, DeviceContextManager, _empty_cache, _memory_replay
+try:
+    from mindspore._c_expression import _enable_pluggable_allocator, _disable_pluggable_allocator
+except ImportError:
+    pass
 from mindspore import _checkparam as Validator
 from mindspore._checkparam import args_type_check
 from mindspore import log as logger
@@ -406,3 +412,109 @@ def memory_replay(file_path):
         >>> ms.runtime.memory_replay("/data/memory_block.csv")
     """
     _memory_replay(os.path.realpath(file_path))
+
+
+class PluggableAllocator():
+    r"""
+    Receive a .so file via ctypes, and dynamically load the alloc and free functions within it.
+    It needs to be used in conjunction with :class:`mindspore.runtime.MemPool` and
+    :func:`mindspore.runtime.use_mem_pool` to take over the memory allocation and free
+    in the MindSpore memory pool.
+
+    .. warning::
+        This is currently supported only on Unix operating systems.
+
+    Args:
+        path_to_so_file (str): Path in the file system to the `.so` file containing
+            the allocator functions.
+        alloc_fn_name (str): Name of the function to perform the memory allocation
+            in the so file. The signature must be:
+            `void* alloc_fn(size_t size, int device, aclrtStream stream);` .
+        free_fn_name (str): Name of the function to perform the memory release
+            in the so file. The signature must be:
+            `void free_fn(void* ptr, size_t size, aclrtStream stream);` .
+
+    Supported Platforms:
+        ``Ascend``
+    """
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        if alloc_fn is None:
+            raise ValueError(f"Cannot find allocator function {alloc_fn_name} in {path_to_so_file}")
+        if free_fn is None:
+            raise ValueError(f"Cannot find free function {free_fn_name} in {path_to_so_file}")
+        self._alloc_fn = alloc_fn
+        self._free_fn = free_fn
+
+    @property
+    def alloc_fn_ptr(self) -> int:
+        """Function pointer of the allocator function."""
+        return self._alloc_fn
+
+    @property
+    def free_fn_ptr(self) -> int:
+        """Function pointer of the free function."""
+        return self._free_fn
+
+
+class MemPool():
+    r"""
+    A MemPool wraps a :class:`mindspore.runtime.PluggableAllocator`
+    and passes it to :func:`mindspore.runtime.use_mem_pool`.
+
+    Args:
+        allocator (mindspore.runtime.PluggableAllocator): a mindspore.runtime.PluggableAllocator
+            that can be used to define how memory gets allocated and freed in the pool.
+
+    Supported Platforms:
+        ``Ascend``
+    """
+
+    def __init__(self, allocator: PluggableAllocator):
+        self._allocator = allocator
+
+    @property
+    def allocator(self) -> PluggableAllocator:
+        """The allocator used by the pool."""
+        return self._allocator
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool):
+    r"""
+    A context manager that routes allocations and deallocations to a given pool.
+
+    Note:
+        - This context manager makes only the current thread's allocations route to the given pool.
+        - If a new thread is spawned inside the context manager, the allocations in that thread
+          will not route to the given pool.
+        - Only device memory allocated inside the context manager is routed to the given pool.
+        - Only Atlas A2 training series products support this interface.
+
+    Args:
+        pool (mindspore.runtime.MemPool): a MemPool object that wraps a PluggableAllocator.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> import mindspore as ms
+        >>> path = "/path/to/allocator.so"
+        >>> allocator = ms.runtime.PluggableAllocator(path, "Alloc", "Free")
+        >>> mem_pool = ms.runtime.MemPool(allocator)
+        >>> shape = (1024, 1024)
+        >>> x = ms.ops.Ones()(shape, ms.float32)
+        >>> with ms.runtime.use_mem_pool(mem_pool):
+        ...     y = ms.ops.Ones()(shape, ms.float32)
+        ...     output = x + y
+    """
+    allocator = pool.allocator
+    _enable_pluggable_allocator(allocator.alloc_fn_ptr, allocator.free_fn_ptr)
+    try:
+        yield
+    finally:
+        _disable_pluggable_allocator()
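The pluggable-allocator hooks above reduce to one ctypes trick: load a shared object and hand MindSpore the raw symbol addresses. A minimal, hedged sketch of that mechanism, with libm's cos standing in for a real Alloc/Free symbol (Unix only, matching the class's own restriction; not part of the diff):

import ctypes
import ctypes.util

# Load a shared object and cast a symbol to a raw void* address; this is the
# same pattern PluggableAllocator.__init__ uses. libm/cos is a stand-in here.
libm = ctypes.CDLL(ctypes.util.find_library("m"))
cos_ptr = ctypes.cast(getattr(libm, "cos"), ctypes.c_void_p).value
print(hex(cos_ptr))  # an integer function pointer, like alloc_fn_ptr/free_fn_ptr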
mindspore/swresample-4.dll CHANGED
Binary file
mindspore/swscale-6.dll CHANGED
Binary file
mindspore/tinyxml2.dll CHANGED
Binary file
mindspore/{experimental/llm_boost → tools}/__init__.py RENAMED
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""
+"""Tools module."""
 from __future__ import absolute_import
 
-
-from mindspore.experimental.llm_boost.ascend_native import *
-from mindspore.experimental.llm_boost.register import LlmBoostRegister
+__all__ = ["stress_detect", "sdc_detect_start", "sdc_detect_stop", "get_sdc_detect_result", "set_dump"]
 
-
+from .stress_detect import stress_detect
+from .sdc_detect import sdc_detect_start, sdc_detect_stop, get_sdc_detect_result
+from .data_dump import set_dump
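Since the file list above also shows mindspore/utils/sdc_detect.py and mindspore/utils/stress_detect.py, a hedged compatibility sketch for scripts that must import these helpers on either wheel (the mindspore.utils fallback is an assumption, not taken from this diff):

# Prefer the mindspore.tools namespace introduced here; fall back to
# mindspore.utils, assumed to export the same names on older wheels.
try:
    from mindspore.tools import stress_detect, sdc_detect_start, sdc_detect_stop
except ImportError:
    from mindspore.utils import stress_detect, sdc_detect_start, sdc_detect_stop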
mindspore/tools/data_dump.py ADDED
@@ -0,0 +1,130 @@
+# Copyright 2021-2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Controlling dump behavior."""
+from __future__ import absolute_import
+from warnings import warn
+
+import mindspore.context as context
+from mindspore._c_expression import security
+
+
+def set_dump(target, enabled=True):
+    """
+    Enable or disable dump for the `target` and its contents.
+
+    `target` should be an instance of :class:`mindspore.nn.Cell` or :class:`mindspore.ops.Primitive` .
+    Please note that this API takes effect only when the Dump function is enabled, and the `dump_mode`
+    field in the Dump configuration file is set to `"2"` with the `ms_backend` compilation backend
+    (please refer to the backend parameter in
+    `jit <https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html>`_).
+    See the `dump document <https://www.mindspore.cn/tutorials/en/master/debug/dump.html>`_ for details.
+    By default, instances of :class:`mindspore.nn.Cell` and :class:`mindspore.ops.Primitive` do not enable
+    the Dump data feature.
+
+    Note:
+        1. This API is only available for JIT compilation, requires 'Ascend' as the device_target and
+           `ms_backend` as the compilation backend (please refer to the backend parameter in
+           `jit <https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html>`_),
+           and does not support fused operators.
+        2. This API only supports being called before training starts.
+           If you call this API during training, it may not be effective.
+        3. After using `set_dump(Cell, True)` , operators in forward and backward
+           computation (computation generated by the grad operations) of the
+           cell will be dumped.
+        4. For :class:`mindspore.nn.SoftmaxCrossEntropyWithLogits` layer, the forward
+           computation and backward computation use the same set of
+           operators. So you can only see dump data from backward computation.
+           Please note that :class:`mindspore.nn.SoftmaxCrossEntropyWithLogits` layer will also use
+           the above operators internally when initialized with `sparse=True` and
+           `reduction="mean"` .
+
+    Args:
+        target (Union[Cell, Primitive]): The Cell instance or Primitive instance
+            to which the dump flag is set.
+        enabled (bool, optional): ``True`` indicates that the dump is enabled, and ``False`` indicates that
+            the dump is disabled.
+            Default: ``True`` .
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        .. note::
+            Please set environment variable `MINDSPORE_DUMP_CONFIG` to the dump config file and set `dump_mode` field
+            in dump config file to 2 before running this example.
+            See `dump document <https://www.mindspore.cn/tutorials/en/master/debug/dump.html>`_ for details.
+
+        >>> import numpy as np
+        >>> import mindspore as ms
+        >>> import mindspore.nn as nn
+        >>> from mindspore import Tensor, jit
+        >>> from mindspore.tools import set_dump
+        >>>
+        >>> ms.set_device(device_target="Ascend")
+        >>>
+        >>> class MyNet(nn.Cell):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.conv1 = nn.Conv2d(5, 6, 5, pad_mode='valid')
+        ...         self.relu1 = nn.ReLU()
+        ...
+        ...     @jit
+        ...     def construct(self, x):
+        ...         x = self.conv1(x)
+        ...         x = self.relu1(x)
+        ...         return x
+        >>>
+        >>> if __name__ == "__main__":
+        ...     net = MyNet()
+        ...     set_dump(net.conv1)
+        ...     input_tensor = Tensor(np.ones([1, 5, 10, 10], dtype=np.float32))
+        ...     output = net(input_tensor)
+    """
+    if security.enable_security():
+        raise ValueError('The set_dump API is not supported, please recompile '
+                         'source without "-s on".')
+
+    import mindspore.nn as nn  # avoid circular import
+    from mindspore.ops import Primitive
+    if not isinstance(target, nn.Cell) and not isinstance(target, Primitive):
+        raise ValueError(f"The \"target\" parameter must be an instance of "
+                         f"Cell or Primitive, "
+                         f"but got an instance of {type(target)}.")
+
+    if not isinstance(enabled, bool):
+        raise ValueError("The \"enabled\" parameter must be bool.")
+
+    # Checking for device target and mode.
+    current_target = context.get_context("device_target")
+    if current_target != "Ascend":
+        # We will not return here in case user changed device_target later.
+        warn("Current device_target is {}, which is not supported by set_dump. "
+             "Only Ascend device target is supported currently. "
+             "If you have Ascend device, consider set device_target to Ascend "
+             "before calling set_dump.".format(current_target))
+
+    # The actual set dump logic.
+    if isinstance(target, nn.Cell):
+        target.add_flags(dump=enabled)
+        for cell in target.cells():
+            set_dump(cell, enabled)
+
+        primitives = getattr(target, "_primitives", {})
+        for value in primitives.values():
+            if value and "dump" in value.attrs:
+                set_dump(value, enabled)
+
+    if isinstance(target, Primitive):
+        target.add_prim_attr("dump", "true" if enabled else "false")
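Because set_dump walks target.cells() and each cell's registered primitives, the flag can be layered; a hedged, self-contained sketch of that recursive semantics (TinyNet is illustrative):

import mindspore.nn as nn
from mindspore.tools import set_dump

# Hedged sketch: enable dump for a whole network, then switch it back off for
# one subcell; a later call simply overrides the flag set by an earlier one.
class TinyNet(nn.Cell):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(5, 6, 5, pad_mode='valid')
        self.relu1 = nn.ReLU()

    def construct(self, x):
        return self.relu1(self.conv1(x))

net = TinyNet()
set_dump(net)               # flags net, net.conv1, net.relu1 and their primitives
set_dump(net.relu1, False)  # dump stays enabled everywhere except relu1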
mindspore/tools/sdc_detect.py ADDED
@@ -0,0 +1,91 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""SDC detect."""
+from mindspore import _c_expression
+
+
+def sdc_detect_start():
+    """
+    Start silent data corruption detection. It will check the inputs and outputs of MatMul operations during the
+    forward and backward computations on the current device, which may increase execution time. The overhead of the
+    check time decreases as the matrix shapes increase. Starting sdc detection results in approximately 100%
+    performance degradation for a single 4096-sized MatMul computation, and approximately 90% degradation on the
+    Llama2-7B model (model parallel is 4, pipeline parallel is 2, and using qkv concatenation and ffn concatenation in
+    decoder layers).
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.tools import sdc_detect_start
+        >>> sdc_detect_start()
+    """
+    _c_expression.sdc_detect_start()
+
+
+def sdc_detect_stop():
+    """
+    Stop silent data corruption detection.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.tools import sdc_detect_stop
+        >>> sdc_detect_stop()
+    """
+    _c_expression.sdc_detect_stop()
+
+
+def get_sdc_detect_result():
+    """
+    Get the result of silent data corruption detection.
+
+    Returns:
+        bool, indicating whether silent data corruption has occurred after detection start.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.tools import get_sdc_detect_result
+        >>> result = get_sdc_detect_result()
+        >>> print(result)
+        False
+    """
+    return _c_expression.get_sdc_detect_result()
+
+
+class _SdcDetector:
+    """
+    Manager of feature value sampling for SDC detect.
+    """
+    def __init__(self):
+        self.param_count = -1
+
+    def need_sample(self):
+        """If need to sample feature value."""
+        if not _c_expression.is_silent_detect_enable():
+            return False
+        grad_sample_interval = _c_expression.get_silent_detect_config('grad_sample_interval')
+        self.param_count = (self.param_count + 1) % grad_sample_interval
+        return self.param_count == 0
+
+    @staticmethod
+    def get_dump_name(param_name):
+        """Get dump file name with sdc prefix."""
+        return _c_expression.get_silent_detect_feature_name(param_name)
+
+_sdc_detector = _SdcDetector()
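A hedged end-to-end sketch of the three functions added above, on an Ascend device (the computation in the middle is a placeholder):

from mindspore.tools import sdc_detect_start, sdc_detect_stop, get_sdc_detect_result

# Bracket the suspect computation: MatMul inputs/outputs on the current device
# are checked between start and stop.
sdc_detect_start()
# ... run the forward/backward steps to be checked (placeholder) ...
sdc_detect_stop()
print(get_sdc_detect_result())  # True if corruption was observed since start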
mindspore/tools/stress_detect.py ADDED
@@ -0,0 +1,63 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Stress detect."""
+from mindspore import _c_expression
+from mindspore import log as logger
+from mindspore.communication import init, create_group, get_rank
+from mindspore.communication import get_local_rank_size
+
+
+def stress_detect(detect_type="aic"):
+    """
+    Used to detect whether there are faults in hardware accuracy or communication between links.
+    The common usage scenario is to initiate a new thread or call this interface through a Callback function
+    at each step or when saving checkpoints, to check whether hardware malfunctions could affect accuracy.
+
+    Args:
+        detect_type (str, optional): The type of stress test to perform. There are two options available: ``'aic'`` and
+            ``'hccs'``, which perform AiCore and HCCS link stress tests on the device, respectively. Default: ``"aic"``.
+
+    Returns:
+        int, the return value represents the error type. 0 indicates normal. 1 indicates failure to start some or
+        all test cases. 2 indicates a hardware failure, and it is recommended to replace the device.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.tools import stress_detect
+        >>> ret = stress_detect()
+        >>> print(ret)
+        0
+    """
+    if detect_type not in ["aic", "hccs"]:
+        logger.error(f"For stress detect, detection type must be 'aic' or 'hccs'. "
+                     f"But got {detect_type}. Exiting stress detect.")
+        return 1
+
+    if detect_type == "aic":
+        return _c_expression.stress_detect("aic")
+
+    init()
+    local_ranks = []
+    local_rank_size = get_local_rank_size()
+    node_num = get_rank() // local_rank_size
+    for i in range(local_rank_size):
+        local_ranks.append(local_rank_size * node_num + i)
+    if get_rank() in local_ranks:
+        group = f"new_group_{node_num}"
+        create_group(group, local_ranks)
+
+    return _c_expression.stress_detect(group)
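For detect_type='hccs', every rank derives its own node's rank list from the division above; a worked sketch of that arithmetic (standalone, no communication, example values assumed):

# Worked sketch of the per-node grouping used by the 'hccs' branch: with 8
# devices per node, global rank 11 lands on node 1, whose ranks are 8..15.
local_rank_size = 8   # assumed devices per node
rank = 11             # assumed global rank
node_num = rank // local_rank_size
local_ranks = [local_rank_size * node_num + i for i in range(local_rank_size)]
print(node_num, local_ranks, f"new_group_{node_num}")
# 1 [8, 9, 10, 11, 12, 13, 14, 15] new_group_1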
mindspore/train/__init__.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,8 +25,8 @@ from mindspore.train import amp
 from mindspore.train.amp import build_train_network
 from mindspore.train.loss_scale_manager import LossScaleManager, FixedLossScaleManager, DynamicLossScaleManager
 from mindspore.train.serialization import save_checkpoint, load_checkpoint, load_param_into_net, export, \
-    load,
-    load_checkpoint_async,
+    load, async_ckpt_thread_status, export_split_mindir, \
+    load_checkpoint_async, get_ckpt_path_with_strategy, ckpt_to_safetensors, safetensors_to_ckpt, \
     build_searched_strategy, merge_sliced_parameter, load_distributed_checkpoint, restore_group_info_list
 from mindspore.train.callback import Callback, LossMonitor, TimeMonitor, ModelCheckpoint, SummaryCollector, \
     CheckpointConfig, RunContext, LearningRateScheduler, SummaryLandscape, FlopsUtilizationCollector, \
@@ -37,9 +37,9 @@ from mindspore.train.metrics import *
 from mindspore.train.data_sink import data_sink
 
 __all__ = ["Model", "DatasetHelper", "connect_network_with_dataset", "build_train_network", "LossScaleManager",
-           "FixedLossScaleManager", "DynamicLossScaleManager", "save_checkpoint", "load_checkpoint",
-           "load_param_into_net", "export", "load", "export_split_mindir", "
-           "
+           "FixedLossScaleManager", "DynamicLossScaleManager", "save_checkpoint", "load_checkpoint",
+           "load_param_into_net", "export", "load", "export_split_mindir", "async_ckpt_thread_status",
+           "data_sink", "load_checkpoint_async", "get_ckpt_path_with_strategy", "ckpt_to_safetensors",
            "safetensors_to_ckpt", "build_searched_strategy", "merge_sliced_parameter", "load_distributed_checkpoint",
            "restore_group_info_list"]
 __all__.extend(callback.__all__)
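The newly re-exported converters are then importable from the package root; a hedged sketch (the file paths and the save_path keyword are illustrative assumptions, not taken from this diff):

from mindspore.train import ckpt_to_safetensors, safetensors_to_ckpt

# Round-trip a checkpoint through the safetensors format; arguments beyond the
# input path are assumptions for illustration.
ckpt_to_safetensors("/path/to/model.ckpt", save_path="/path/to/st_dir")
safetensors_to_ckpt("/path/to/st_dir/model.safetensors", save_path="/path/to/ckpt_dir")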
mindspore/train/_utils.py CHANGED
@@ -344,15 +344,7 @@ def _get_layout_opt_shard(layout_obj, param_redundancy_dict):
     """Layout ckpt append opt shard."""
     for key, value in layout_obj.items():
         if value[5]:
-
-            if value[5] in world_groups:
-                opt_para_num = get_group_size()
-            elif "-" in value[5]:
-                opt_para_str = value[5].split("-")[0]
-                opt_para_num = int(opt_para_str)
-            else:
-                raise ValueError(f"For get_parameter_redundancy, the format of the parallel communication domain for "
-                                 f"the optimizer is incorrect.")
+            opt_para_num = get_group_size(value[5])
             param_redundancy_ranks = param_redundancy_dict.get(key)
             res = []
             for param_ranks in param_redundancy_ranks:
@@ -582,17 +574,12 @@ def _progress_bar(iterable, total=None):
         print_progress_bar(i)
 
 
-def _load_and_transform(path, name_map, load_func
+def _load_and_transform(path, name_map, load_func):
     """use load_func to load and use transform_func to convert"""
-
-        param_dict = load_func(path)
-    else:
-        param_dict = path
+    param_dict = load_func(path)
     transform_dict = {}
+
     for k, v in param_dict.items():
         new_name = name_map.get(k, k) if name_map is not None else k
-
-            transform_dict[new_name] = transform_func(v, new_name)
-        else:
-            transform_dict[new_name] = v
+        transform_dict[new_name] = v
     return transform_dict
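The second hunk reduces _load_and_transform to a plain load-and-rename; a hedged standalone equivalent of the new behavior (function and argument names here are illustrative):

# Load a parameter dict via load_func and rename keys through name_map,
# leaving values untouched (mirrors the simplified _load_and_transform).
def load_and_rename(path, name_map, load_func):
    param_dict = load_func(path)
    return {(name_map.get(k, k) if name_map is not None else k): v
            for k, v in param_dict.items()}

print(load_and_rename("unused", {"w": "weight"}, lambda _: {"w": 1, "b": 2}))
# {'weight': 1, 'b': 2}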
mindspore/train/amp.py CHANGED
@@ -818,8 +818,10 @@ def get_white_list():
         <class 'mindspore.ops.operations.nn_ops.Conv2DTranspose'>,
         <class 'mindspore.ops.operations.nn_ops.Conv3DTranspose'>,
         <class 'mindspore.ops.operations.nn_ops.Conv2DBackpropInput'>,
-        <class 'mindspore.ops.
-        <class 'mindspore.ops.
+        <class 'mindspore.ops.auto_generate.gen_ops_prim.MatMul'>,
+        <class 'mindspore.ops.auto_generate.gen_ops_prim.BatchMatMul'>,
+        <class 'mindspore.ops.auto_generate.gen_ops_prim.PReLU'>,
+        <class 'mindspore.ops.auto_generate.gen_ops_prim.ReLU'>,
         <class 'mindspore.ops.operations.math_ops.Ger'>]
     """
     white_list = AMP_WHITE_LIST.copy()
@@ -871,8 +873,8 @@ def custom_mixed_precision(network, *, white_list=None, black_list=None, dtype=m
             white list is not used.
         black_list (list[Cell], optional): Black list of custom mixed precision. Defaults: ``None`` , means
             black list is not used.
-        dtype (Type): The type used in lower precision calculations, can be ``mstype.float16`` or
-            default: ``mstype.float16`` .
+        dtype (Type, optional): The type used in lower precision calculations, can be ``mstype.float16`` or
+            ``mstype.bfloat16`` , default: ``mstype.float16`` .
 
     Returns:
         network (Cell), A network supporting mixed precision.
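A hedged usage sketch of the clarified dtype option, combining it with the white list documented above (the toy network and argument choices are illustrative, only one of white_list/black_list is passed here):

import mindspore as ms
from mindspore import nn
from mindspore.train.amp import get_white_list, custom_mixed_precision

# Run the white-listed primitives of a toy network in bfloat16 instead of the
# default float16.
net = nn.Dense(16, 16)
net = custom_mixed_precision(net, white_list=get_white_list(), dtype=ms.bfloat16)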
mindspore/train/callback/_checkpoint.py CHANGED
@@ -27,7 +27,6 @@ from mindspore.train._utils import _make_directory
 from mindspore.train.serialization import save_checkpoint, _save_graph, _wait_async_process_save_ckpt, \
     _wait_async_thread_save_ckpt, _check_async_save
 from mindspore.parallel._cell_wrapper import destroy_allgather_cell
-from mindspore.parallel._recovery_context import _set_recovery_context, _get_recovery_context
 from mindspore.communication.management import get_rank, get_group_size
 from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy, _get_pp_size_from_redundancy_map
 from mindspore.train.callback._callback import Callback
@@ -509,9 +508,6 @@ class ModelCheckpoint(Callback):
         if callable(prefix):
             self._prefix_func = prefix
 
-        if context.get_context("device_target") == "GPU" and _get_recovery_context("enable_recovery"):
-            _set_recovery_context(ckpt_path=self._directory)
-
         if config is None:
             self._config = CheckpointConfig()
         else:
@@ -577,11 +573,6 @@ class ModelCheckpoint(Callback):
             self._directory = self._directory_func(cb_params)
             _make_directory(self._directory)
         collect_host_info("Callback", "ModelCheckpoint", "step_end", start_time=get_clock_syscnt(), level=1)
-        # In disaster recovery scenario, the training process may be rolled back to the last step where
-        # the ckpt was successfully saved, so the _last_triggered_step should be updated.
-        if _get_recovery_context("enable_recovery") and cb_params.last_save_ckpt_step is not None:
-            self._last_triggered_step = cb_params.last_save_ckpt_step
-            cb_params.last_save_ckpt_step = None
 
         # save graph (only once)
         if not self._graph_saved: