mindspore 2.3.0__cp310-cp310-win_amd64.whl → 2.4.1__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/initializer.py +51 -15
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +62 -15
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +183 -37
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +315 -60
- mindspore/communication/management.py +14 -14
- mindspore/context.py +132 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +983 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +268 -23
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +26 -13
- mindspore/mint/special/__init__.py +63 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +276 -96
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +137 -10
- mindspore/nn/layer/embedding.py +137 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +124 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
- mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +767 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
- mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +492 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +564 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +402 -12
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +7 -2
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +14 -146
- mindspore/ops/operations/comm_ops.py +63 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +273 -20
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +31 -9
- mindspore/parallel/_cell_wrapper.py +85 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +127 -13
- mindspore/parallel/_utils.py +53 -22
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +1146 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +285 -413
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +39 -104
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +105 -19
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +97 -31
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +145 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +375 -0
- mindspore/train/dataset_helper.py +15 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +154 -58
- mindspore/train/serialization.py +342 -128
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +248 -242
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
mindspore/train/callback/_checkpoint.py

@@ -18,8 +18,8 @@ from __future__ import absolute_import
 import os
 import stat
 import time
-
 import threading
+
 import mindspore.context as context
 from mindspore import log as logger
 from mindspore import nn
@@ -37,14 +37,22 @@ from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from mindspore.common.generator import Generator
 from mindspore.common.api import _cell_graph_executor
-from mindspore._c_expression import
-
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 _cur_dir = os.getcwd()
 SAVE_DIR = _cur_dir
 _info_list = ["epoch_num", "step_num"]
 
 
+def _wait_async_save_ckpt(async_save=False):
+    """Waiting for asynchronous saving of ckpt to complete."""
+    if async_save:
+        thread_list = threading.enumerate()
+        for thread in thread_list:
+            if thread.getName() == "asyn_save_ckpt":
+                thread.join()
+
+
 def _get_dp_tp_from_redundancy(redundancy_tuple):
     """From redundancy get dp and tp"""
     dp = []
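The new _wait_async_save_ckpt helper simply joins the background thread that asynchronous checkpoint saving names "asyn_save_ckpt". A self-contained illustration of that thread-name based wait, where the worker function is invented for the example:

import threading
import time

def _fake_async_save():            # stand-in for the real async checkpoint writer
    time.sleep(0.5)

# The async save path names its worker thread "asyn_save_ckpt"; the helper
# scans threading.enumerate() and joins any thread with that name.
threading.Thread(target=_fake_async_save, name="asyn_save_ckpt").start()

for thread in threading.enumerate():
    if thread.getName() == "asyn_save_ckpt":
        thread.join()
print("checkpoint writer finished")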
@@ -88,9 +96,9 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
             if index == 0:
                 suffix_num = max(suffix_num, 1)
             elif index != -1:
-                num = filename[pre_len+1:pre_len+index]
+                num = filename[pre_len + 1:pre_len + index]
                 if num.isdigit():
-                    suffix_num = max(suffix_num, int(num)+1)
+                    suffix_num = max(suffix_num, int(num) + 1)
 
     if suffix_num != 0:
         prefix = f'{prefix}_{suffix_num}'
@@ -98,6 +106,14 @@ def _chg_ckpt_file_name_if_same_exist(directory, prefix, exception=False):
     return prefix
 
 
+def _check_format_and_other_params(format, enc_key, enc_mode, crc_check=False, async_save=False, exception_save=False,
+                                   map_param_inc=False, global_step_num=None):
+    param_not_default = (enc_key is not None or enc_mode != "AES-GCM" or crc_check or async_save
+                         or exception_save or map_param_inc or global_step_num is not None)
+    if format == "safetensors" and param_not_default:
+        raise ValueError("For 'save_checkpoint', when format is 'safetensors', other param must be default.")
+
+
 class CheckpointConfig:
     """
     The configuration of model checkpoint.
@@ -136,6 +152,10 @@ class CheckpointConfig:
         exception_save (bool): Whether to save the current checkpoint when an exception occurs. Default: ``False`` .
         crc_check (bool): Whether to perform crc32 calculation when saving checkpoint and save the calculation
             result to the end of ckpt. Default: ``False`` .
+        remove_redundancy (bool): Whether to enable saving the checkpoint with redundancy removal.
+            Redundancy removal refers to eliminating redundant data in data parallelism mode. Default: ``False`` , means
+            redundant-free saving is not enabled.
+        format (str): Format of the output file, can be "ckpt" or "safetensors". Default: "ckpt".
         kwargs (dict): Configuration options dictionary.
 
     Raises:
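The two options documented above can be combined with ModelCheckpoint roughly as follows. This is a minimal sketch based only on the parameters shown in this diff; the prefix, directory and step counts are illustrative, and _check_format_and_other_params (added earlier in this file) rejects "safetensors" whenever a non-default option such as enc_key, crc_check or async_save is also set.

from mindspore.train import ModelCheckpoint, CheckpointConfig

# Save a redundancy-free safetensors checkpoint every 500 steps (sketch only).
# With format="safetensors", options such as enc_key, crc_check or async_save
# must stay at their defaults, otherwise CheckpointConfig raises ValueError.
config = CheckpointConfig(save_checkpoint_steps=500,
                          keep_checkpoint_max=5,
                          remove_redundancy=True,   # only meaningful in parallel training
                          format="safetensors")
ckpt_cb = ModelCheckpoint(prefix="net", directory="./ckpt", config=config)
# model.train(num_epochs, train_dataset, callbacks=[ckpt_cb])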
@@ -188,6 +208,8 @@ class CheckpointConfig:
                  enc_mode='AES-GCM',
                  exception_save=False,
                  crc_check=False,
+                 remove_redundancy=False,
+                 format="ckpt",
                  **kwargs):
 
         if save_checkpoint_steps is not None:
@@ -231,8 +253,13 @@ class CheckpointConfig:
         self._enc_key = Validator.check_isinstance('enc_key', enc_key, (type(None), bytes))
         self._enc_mode = Validator.check_isinstance('enc_mode', enc_mode, str)
         self._crc_check = Validator.check_isinstance('crc_check', crc_check, bool)
+        self._format = Validator.check_isinstance('format', format, str)
         self._map_param_inc = kwargs.get('incremental', False)
         self.enable_redundance = kwargs.get('enable_redundance', False)
+        self.remove_redundancy = Validator.check_isinstance('remove_redundancy', remove_redundancy, bool)
+
+        _check_format_and_other_params(format, enc_key, enc_mode, crc_check, async_save, exception_save,
+                                       self._map_param_inc)
 
     @property
     def save_checkpoint_steps(self):
@@ -333,6 +360,10 @@ class CheckpointConfig:
         """
         return self._crc_check
 
+    @property
+    def format(self):
+        return self._format
+
     @property
     def append_dict(self):
         """
@@ -495,10 +526,10 @@ class ModelCheckpoint(Callback):
         self._aiturbo_init_flag = os.getenv("AITURBO") == "1"
         # get existing checkpoint files
         if self._aiturbo_init_flag:
-            import
-            self._manager =
+            from aiturbo.checkpoint.aiturbo_mindspore_ckpt import CheckpointShmManager
+            self._manager = CheckpointShmManager()
         else:
-            self._manager = CheckpointManager()
+            self._manager = CheckpointManager(self._config.format)
         if not callable(directory) and not callable(prefix):
             self._prefix = _chg_ckpt_file_name_if_same_exist(self._directory, self._prefix)
         self._append_dict = self._config.append_dict or {}
@@ -517,7 +548,7 @@ class ModelCheckpoint(Callback):
         """
         cb_params = run_context.original_args()
         if self._aiturbo_init_flag:
-            import aiturbo
+            from aiturbo.checkpoint import aiturbo_mindspore as aiturbo
             ckpt_storage_path = self._directory
             rank_id = get_rank()
             stage_num = _get_auto_parallel_context("pipeline_stages")
@@ -536,7 +567,7 @@ class ModelCheckpoint(Callback):
                       "stage_layout": param_redundancy_dict}
             single_params = remove_param_redundancy(param_redundancy_dict)
             single_params = {device_id: list(params) for device_id, params in single_params.items()}
-            aiturbo.init(ckpt_storage_path, rank_id, layout, single_params, self._config.enable_redundance, dp)
+            aiturbo.init(ckpt_storage_path, rank_id, layout, single_params, not self._config.enable_redundance, dp)
             self._aiturbo_init_flag = False
         if self._prefix_func:
             self._prefix = self._prefix_func(cb_params)
@@ -546,14 +577,14 @@ class ModelCheckpoint(Callback):
                                  "string that does not contain '/', but got {}.".format(self._prefix))
         if self._directory_func:
             self._directory = self._directory_func(cb_params)
-
+        _make_directory(self._directory)
+        collect_host_info("Callback", "ModelCheckpoint", "step_end", start_time=get_clock_syscnt(), level=1)
         # In disaster recovery scenario, the training process may be rolled back to the last step where
         # the ckpt was successfully saved, so the _last_triggered_step should be updated.
         if _get_recovery_context("enable_recovery") and cb_params.last_save_ckpt_step is not None:
             self._last_triggered_step = cb_params.last_save_ckpt_step
             cb_params.last_save_ckpt_step = None
 
-        _make_directory(self._directory)
         # save graph (only once)
         if not self._graph_saved:
             graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
@@ -561,10 +592,6 @@ class ModelCheckpoint(Callback):
                 os.remove(graph_file_name)
             _save_graph(cb_params.train_network, graph_file_name)
             self._graph_saved = True
-        thread_list = threading.enumerate()
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
         self._save_ckpt(cb_params)
 
     def end(self, run_context):
@@ -575,15 +602,12 @@ class ModelCheckpoint(Callback):
             run_context (RunContext): Context of the train running.
         """
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "ModelCheckpoint", "end", start_time=get_clock_syscnt(), level=1)
         _to_save_last_ckpt = True
 
         self._save_ckpt(cb_params, _to_save_last_ckpt)
 
-
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
+        _wait_async_save_ckpt(self._config.async_save)
 
         destroy_allgather_cell()
 
@@ -601,6 +625,13 @@ class ModelCheckpoint(Callback):
 
         return False
 
+    def _append_dict_content(self, epoch_num, step_num):
+        """Append append_dict content."""
+        if "epoch_num" in self._append_dict:
+            self._append_dict["epoch_num"] = self._append_epoch_num + epoch_num
+        if "step_num" in self._append_dict:
+            self._append_dict["step_num"] = self._append_step_num + step_num
+
     def _save_ckpt(self, cb_params, force_to_save=False):
         """Save checkpoint files."""
         if cb_params.cur_step_num == self._last_triggered_step:
@@ -614,11 +645,12 @@ class ModelCheckpoint(Callback):
         step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
 
         if save_ckpt:
+            _wait_async_save_ckpt(self._config.async_save)
            if self._prefix_func:
-                cur_ckpoint_file = self._prefix + ".
+                cur_ckpoint_file = self._prefix + f".{self._config.format}"
            else:
                cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
-
+                                  + str(step_num_in_epoch) + f".{self._config.format}"
            # update checkpoint file list.
            self._manager.update_ckpoint_filelist(self._directory, self._prefix)
            # keep checkpoint files number equal max number.
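With the format-aware naming above, the periodic checkpoint file name becomes prefix-epoch_step.<format> instead of a hard-coded .ckpt suffix. A quick illustration with assumed example values:

prefix, cur_epoch_num, step_num_in_epoch = "net", 3, 250   # assumed example values
fmt = "safetensors"                                        # self._config.format
cur_ckpoint_file = prefix + "-" + str(cur_epoch_num) + "_" \
                   + str(step_num_in_epoch) + f".{fmt}"
print(cur_ckpoint_file)  # net-3_250.safetensors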
@@ -644,20 +676,51 @@ class ModelCheckpoint(Callback):
                 set_cur_net(cb_params.train_network)
                 cb_params.train_network.add_flags(ge_sync_data=True)
                 _cell_graph_executor(cb_params.train_network, phase='save')
-
-            self._append_dict["epoch_num"] = self._append_epoch_num + cb_params.cur_epoch_num
-            if "step_num" in self._append_dict:
-                self._append_dict["step_num"] = self._append_step_num + cb_params.cur_step_num
+            self._append_dict_content(cb_params.cur_epoch_num, cb_params.cur_step_num)
             network = self._config.saved_network if self._config.saved_network is not None else cb_params.train_network
             if os.getenv("AITURBO") == "1":
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, incremental=self._map_param_inc,
                                 global_step_num=cb_params.cur_step_num)
+            elif self._config.remove_redundancy:
+                parallel_mode = context.get_auto_parallel_context("parallel_mode")
+                if parallel_mode == "stand_alone":
+                    raise TypeError(f"The deduplication feature for saving checkpoint can only be used "
+                                    f"in parallel scenarios, but got {parallel_mode}.")
+                param_layout = network.parameter_layout_dict
+                rank_id = get_rank()
+                if param_layout:
+                    device_num = _get_device_num()
+                    stage_num = _get_auto_parallel_context("pipeline_stages")
+                    chunk_size = device_num // stage_num
+                    initial_rank = (rank_id // chunk_size) * chunk_size
+                    param_redundancy_dict = get_parameter_redundancy(param_layout, initial_rank)
+                    single_params = remove_param_redundancy(param_redundancy_dict)
+                    save_param_names = single_params.get(rank_id)
+                    param_layout_set = set(param_layout.keys())
+                    if save_param_names == param_layout.keys():
+                        logger.warning(
+                            f"For remove_redundancy save checkpoint, the saved parameters are non-redundant.")
+
+                    def choice_func(x):
+                        return x not in param_layout_set or x in save_param_names
+                else:
+                    param_redundancy_dict = get_parameter_redundancy(network)
+                    single_params = remove_param_redundancy(param_redundancy_dict)
+                    save_param_names = single_params.get(rank_id)
+
+                    def choice_func(x):
+                        return x in save_param_names
+                save_checkpoint(network, cur_file, False, self._config.async_save,
+                                self._append_dict, self._config.enc_key, self._config.enc_mode,
+                                crc_check=self._config.crc_check, format=self._config.format,
+                                incremental=self._map_param_inc, choice_func=choice_func)
             else:
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
-                                crc_check=self._config.crc_check,
+                                crc_check=self._config.crc_check, format=self._config.format,
+                                incremental=self._map_param_inc)
 
             self._latest_ckpt_file_name = cur_file
 
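The remove_redundancy branch above decides, per rank, which parameters to write by handing a choice_func predicate to save_checkpoint. A standalone sketch of that selection logic follows; the parameter names and the per-rank ownership data are invented stand-ins for what get_parameter_redundancy and remove_param_redundancy would produce.

# Sketch only: mimic the per-rank parameter selection used by the
# remove_redundancy branch; names and ownership data are invented.
param_layout_set = {"dense.weight", "dense.bias", "embed.table"}  # sharded parameters
save_param_names = {"dense.weight"}                               # shards this rank must save

def choice_func(name):
    # Keep parameters that are not sharded at all, plus the shards this
    # rank owns; redundant copies held by other ranks are skipped.
    return name not in param_layout_set or name in save_param_names

all_params = ["dense.weight", "dense.bias", "embed.table", "learning_rate", "global_step"]
print([p for p in all_params if choice_func(p)])
# -> ['dense.weight', 'learning_rate', 'global_step']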
@@ -691,8 +754,9 @@ class ModelCheckpoint(Callback):
 class CheckpointManager:
     """Manage checkpoint files according to train_config of checkpoint."""
 
-    def __init__(self):
+    def __init__(self, format='ckpt'):
         self._ckpoint_filelist = []
+        self._format = format
 
     @property
     def ckpoint_filelist(self):
@@ -707,10 +771,12 @@ class CheckpointManager:
     def update_ckpoint_filelist(self, directory, prefix):
         """Update the checkpoint file list."""
         self._ckpoint_filelist = []
+        format = self._format
+        format_length = len(format) + 1
         files = os.listdir(directory)
         for filename in files:
-            if os.path.splitext(filename)[-1] == ".
-                mid_name = filename[len(prefix):-
+            if os.path.splitext(filename)[-1] == f".{format}" and filename.startswith(prefix + "-"):
+                mid_name = filename[len(prefix):-format_length]
                 flag = not (True in [char.isalpha() for char in mid_name])
                 if flag:
                     self._ckpoint_filelist.append(os.path.join(directory, filename))
mindspore/train/callback/_cluster_monitor.py

@@ -150,7 +150,7 @@ class ClusterMonitor(Callback):
         with _perf_mutex:
             dir_path = os.path.dirname(self.full_path)
             if not os.path.exists(dir_path):
-                os.makedirs(dir_path)
+                os.makedirs(dir_path, mode=0o700)
             if os.path.exists(self.full_path):
                 os.chmod(self.full_path, stat.S_IWUSR)
                 os.remove(self.full_path)
mindspore/train/callback/_flops_collector.py

@@ -65,6 +65,7 @@ class FlopsUtilizationCollector(Callback):
     Raises:
         TypeError: If data_size is not positive int.
         TypeError: If full_flops is not bool.
+        AssertionError: If the training mode is not a static graph or not a static shape.
 
     Examples:
         >>> import numpy as np
mindspore/train/callback/_loss_monitor.py

@@ -19,7 +19,7 @@ import numpy as np
 
 from mindspore import _checkparam as Validator
 from mindspore.train.callback._callback import Callback, _handle_loss
-from mindspore._c_expression import
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 
 class LossMonitor(Callback):
@@ -70,7 +70,7 @@ class LossMonitor(Callback):
                 please refer to :class:`mindspore.train.RunContext`.
         """
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "LossMonitor", "step_end", start_time=get_clock_syscnt(), level=1)
         cur_epoch_num = cb_params.get("cur_epoch_num", 1)
         loss = _handle_loss(cb_params.net_outputs)
 
@@ -101,7 +101,7 @@ class LossMonitor(Callback):
                 please refer to :class:`mindspore.train.RunContext`.
         """
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "LossMonitor", "train_epoch_end", start_time=get_clock_syscnt(), level=1)
         metrics = cb_params.get("metrics")
         if metrics:
             print("Eval result: epoch %d, metrics: %s" % (cb_params.cur_epoch_num, metrics))
mindspore/train/callback/_on_request_exit.py

@@ -16,12 +16,19 @@
 
 from __future__ import absolute_import
 import os
+import json
 import signal
-
-from mindspore import
+import threading
+from mindspore.common import dtype as mstype
+from mindspore import context
+from mindspore import log as logger
+from mindspore.common.tensor import Tensor
+from mindspore.train._utils import _make_directory
 from mindspore import _checkparam as Validator
 from mindspore.train.serialization import load_checkpoint, save_checkpoint, export
 from mindspore.train.callback._callback import Callback
+from mindspore.parallel._utils import _get_parallel_mode
+from mindspore.context import ParallelMode
 
 
 class OnRequestExit(Callback):
@@ -29,7 +36,8 @@ class OnRequestExit(Callback):
     Respond to the user's closing request, exit the training or eval process, and save the checkpoint and mindir.
 
     Register OnRequestExit Callback before training, when the user want to exit the training process
-    and save the training data, could send the registered exit signal 'sig' to the training process
+    and save the training data, could send the registered exit signal 'sig' to the training process or modify the
+    'GracefulExit' that a key in the json file specified by the 'config_file' to '1'.
     After the training process executes the current step, saves the current training status,
     including checkpoint and mindir, and then exit the training process.
 
@@ -38,9 +46,12 @@ class OnRequestExit(Callback):
         save_mindir (bool): Whether save the mindir before the training process exit. Default: ``True`` .
         file_name (str): The saved checkpoint and mindir file name,
             the checkpoint file add suffix '.ckpt', the mindir file add suffix '.mindir'. Default: ``'Net'`` .
-        directory (str): The
+        directory (str): The path to save files. It will generate a 'rank_{id}' path by rank_id
+            to save checkpoint and mindir. Default: ``'./'`` .
         sig (int): The user registered exit signal, it must be a captureable and negligible signal.
             When the process receives the signal, exits the training or eval process. Default: ``signal.SIGTERM`` .
+        config_file (str): A json config file used to exit training process gracefully. Key: ``{"GracefulExit": 1}`` .
+            Default: ``None`` .
 
     Raises:
         ValueError: If the 'save_ckpt' is not a bool.
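The parameters documented above add a second, file-driven way to request a graceful exit besides the signal path. A minimal usage sketch follows; the MS_ENABLE_GRACEFUL_EXIT environment variable and the {"GracefulExit": 1} key come from this diff, while the paths and file names are illustrative:

import json
import os
from mindspore.train.callback import OnRequestExit

# Graceful exit is gated behind this environment variable (see _do_step_begin below).
os.environ["MS_ENABLE_GRACEFUL_EXIT"] = "1"

# Register the callback with a watched JSON config file (illustrative path).
exit_cb = OnRequestExit(save_ckpt=True, save_mindir=True, file_name='Net',
                        directory='./output', config_file='./graceful_exit.json')
# model.train(num_epochs, dataset, callbacks=[exit_cb])

# From another process, request a graceful stop by flipping the key to 1:
with open('./graceful_exit.json', 'w') as f:
    json.dump({"GracefulExit": 1}, f)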
@@ -67,20 +78,28 @@ class OnRequestExit(Callback):
         >>> model.train(10, dataset, callbacks=on_request_exit)
     """
 
-    def __init__(self, save_ckpt=True, save_mindir=True, file_name='Net', directory='./',
+    def __init__(self, save_ckpt=True, save_mindir=True, file_name='Net', directory='./', config_file=None,
+                 sig=signal.SIGTERM):
         super(OnRequestExit, self).__init__()
         self.save_ckpt = Validator.check_isinstance('save_ckpt', save_ckpt, bool)
         self.save_mindir = Validator.check_isinstance('save_mindir', save_mindir, bool)
-        if self.save_ckpt or self.save_mindir:
-            file_name = Validator.check_isinstance('file_name', file_name, str)
-            directory = Validator.check_isinstance('directory', directory, str)
-            os.makedirs(os.path.abspath(directory), exist_ok=True)
-            self.train_file_path = os.path.abspath(os.path.join(directory, f"{file_name}_train"))
-            self.eval_file_path = os.path.abspath(os.path.join(directory, f"{file_name}_eval"))
         self.sig = Validator.check_isinstance('sig', sig, int)
         if hasattr(signal, "SIGKILL") and self.sig == signal.SIGKILL:
             raise ValueError("Not support send exit request by signal SIGKILL.")
-        self.exit = False
+        self.exit = False  # used signal to exit the training process
+        self.lock = threading.Lock()
+        self.save_path = directory
+        self.key = "GracefulExit"
+        self.remote_config_file = config_file  # used config file to save checkpoint and exit training process
+        self.use_graceful = os.environ.get("MS_ENABLE_GRACEFUL_EXIT") == "1"
+        self.is_distributed = _get_parallel_mode() != ParallelMode.STAND_ALONE
+        self.integrated_save = True
+        if self.is_distributed:
+            self.integrated_save = _get_parallel_mode() == ParallelMode.AUTO_PARALLEL
+        self.stop_train = False
+        self.need_do_step_end = False
+        if self.save_ckpt or self.save_mindir:
+            self.train_name, self.eval_name = self._get_save_path(file_name)
 
     def on_train_begin(self, run_context):
         """
@@ -91,22 +110,31 @@
                 For more details, please refer to :class:`mindspore.train.RunContext`.
         """
         signal.signal(self.sig, self._handle_signal)
-        if self.save_ckpt and os.path.isfile(f"{self.
+        if self.save_ckpt and os.path.isfile(f"{self.train_name}.ckpt"):
             cb_params = run_context.original_args()
             train_net = cb_params.train_network
-            load_checkpoint(f"{self.
+            load_checkpoint(f"{self.train_name}.ckpt", net=train_net)
+
+    def on_train_step_begin(self, run_context):
+        """
+        Check whether received the exit signal or
+        whether the value of 'GracefulExit' in 'config_file' was changed to '1'.
+
+        Args:
+            run_context (RunContext): Context information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        self._do_step_begin(run_context)
 
     def on_train_step_end(self, run_context):
         """
-
-        Then exit the training process after this step training.
+        Save checkpoint file or mindir file according to config, and exit the training process.
 
         Args:
             run_context (RunContext): Include some information of the model.
                 For more details, please refer to :class:`mindspore.train.RunContext`.
         """
-
-        run_context.request_stop()
+        self._do_step_end(run_context)
 
     def on_train_epoch_end(self, run_context):
         """
@@ -118,8 +146,7 @@
             run_context (RunContext): Include some information of the model.
                 For more details, please refer to :class:`mindspore.train.RunContext`.
         """
-
-        run_context.request_stop()
+        self._do_step_end(run_context)
 
     def on_train_end(self, run_context):
         """
@@ -135,10 +162,10 @@
         cb_params = run_context.original_args()
         train_net = cb_params.train_network
         if self.save_ckpt:
-            save_checkpoint(train_net, ckpt_file_name=self.
+            save_checkpoint(train_net, ckpt_file_name=self.train_name)
         if self.save_mindir:
             inputs = cb_params.train_dataset_element
-            export(train_net, *inputs, file_name=self.
+            export(train_net, *inputs, file_name=self.train_name, file_format='MINDIR')
 
     def on_eval_begin(self, run_context):
         """
@@ -153,15 +180,15 @@
             return
         cb_params = run_context.original_args()
         eval_net = cb_params.eval_network
-        if os.path.isfile(f"{self.
-            load_checkpoint(f"{self.
-        elif os.path.isfile(f"{self.
-            load_checkpoint(f"{self.
+        if os.path.isfile(f"{self.eval_name}.ckpt"):
+            load_checkpoint(f"{self.eval_name}.ckpt", net=eval_net)
+        elif os.path.isfile(f"{self.train_name}.ckpt"):
+            load_checkpoint(f"{self.train_name}.ckpt", net=eval_net)
 
     def on_eval_step_end(self, run_context):
         """
-        When the eval step end, if received the exit signal, set
-        Then exit the eval process after this step eval.
+        When the eval step end, if received the exit signal, set attribute '_stop_requested' of the
+        'run_context' to True. Then exit the eval process after this step eval.
 
         Args:
             run_context (RunContext): Include some information of the model.
@@ -184,12 +211,99 @@
         cb_params = run_context.original_args()
         eval_net = cb_params.eval_network
         if self.save_ckpt:
-            save_checkpoint(eval_net, ckpt_file_name=self.
+            save_checkpoint(eval_net, ckpt_file_name=self.eval_name)
         if self.save_mindir:
             inputs = cb_params.eval_dataset_element
-            export(eval_net, *inputs, file_name=self.
+            export(eval_net, *inputs, file_name=self.eval_name, file_format='MINDIR')
 
     def _handle_signal(self, signum, frame):
         """Handle the received signal"""
-
+        logger.debug(f"signum: {signum}, frame: {frame}")
         self.exit = True
+
+    def _do_step_end(self, run_context):
+        """
+        Save the checkpoint or mindir, and then exit training process.
+
+        Args:
+            run_context (RunContext): Include some information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        with self.lock:
+            # save once
+            if self.stop_train or not self.need_do_step_end:
+                return
+            logger.info("Gracefully exiting training process on step end.")
+            call_params = run_context.original_args()
+            net = call_params.train_network
+            for _, param in net.parameters_and_names():
+                if param.name == "graceful_exit" and param.asnumpy() == True:  # pylint: disable=C0121
+                    logger.warning("Graceful exit is triggered, stop training.")
+                    if self.save_ckpt:
+                        append_dict = {"epoch_num": call_params.cur_epoch_num,
+                                       "step_num": call_params.cur_step_num,
+                                       "batch_num": call_params.batch_num}
+                        if call_params.loss_scale_mananger is not None:
+                            append_dict["loss_scale"] = call_params.loss_scale_mananger.get_loss_scale()
+                        if call_params.optimizer is not None:
+                            global_step = int(call_params.optimizer.global_step.data)
+                        else:
+                            global_step = int(call_params.network.optimizer.global_step.data)
+                        append_dict["global_step"] = global_step
+                        save_checkpoint(net, self.train_name, integrated_save=self.integrated_save,
+                                        append_dict=append_dict)
+                    if self.save_mindir:
+                        inputs = call_params.train_dataset_element
+                        export(net, *inputs, file_name=self.train_name, file_format='MINDIR')
+                    run_context.request_stop()
+                    self.stop_train = True
+
+    def _do_step_begin(self, run_context):
+        """
+        Check training process exit configuration at the step begin.
+
+        Args:
+            run_context (RunContext): Include some information of the model.
+                For more details, please refer to :class:`mindspore.train.RunContext`.
+        """
+        with self.lock:
+            # no env
+            if not self.use_graceful:
+                return
+            if self._check_config_info() or self.exit:
+                call_params = run_context.original_args()
+                net = call_params.train_network
+                for _, param in net.parameters_and_names():
+                    if not self.is_distributed and param.name == "graceful_exit":
+                        param.set_data(Tensor(True, mstype.bool_))
+                        self.need_do_step_end = True
+                        break
+                    if param.name == "graceful_init":
+                        param.set_data(Tensor([1], mstype.int32))
+                        self.need_do_step_end = True
+                        break
+
+    def _check_config_info(self):
+        """check json config info"""
+        if self.remote_config_file is not None and os.path.exists(self.remote_config_file):
+            with open(self.remote_config_file, "r") as f:
+                try:
+                    config_info = json.load(f)
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Parse json file failed: {e}, please check json file: {self.remote_config_file}")
+                    return False
+                if self.key in config_info and config_info[self.key] == 1:
+                    return True
+        return False
+
+    def _get_save_path(self, file_name):
+        """path to save checkpoint files or mindir files"""
+        device_id = context.get_context("device_id")
+        if self.save_path is None:
+            tmp = os.path.join(os.getcwd(), r"rank_" + str(device_id))
+            path_ = _make_directory(tmp)
+            return os.path.join(path_, f"{file_name}_train"), os.path.join(path_, f"{file_name}_eval")
+
+        save_path = os.path.join(self.save_path, r"rank_" + str(device_id))
+        save_path = _make_directory(save_path)
+        return os.path.join(save_path, f"{file_name}_train"), os.path.join(save_path, f"{file_name}_eval")
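For reference, the per-step file check done by _check_config_info above reduces to the following standalone sketch (the config path is an assumed example):

import json
import os

def graceful_exit_requested(config_file, key="GracefulExit"):
    """Return True if the watched JSON file exists and sets the key to 1."""
    if config_file is None or not os.path.exists(config_file):
        return False
    with open(config_file, "r") as f:
        try:
            config_info = json.load(f)
        except json.JSONDecodeError:
            return False
    return config_info.get(key) == 1

print(graceful_exit_requested("./graceful_exit.json"))  # False until the key is set to 1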
mindspore/train/callback/_summary_collector.py

@@ -41,7 +41,7 @@ from mindspore.nn.optim.optimizer import Optimizer
 from mindspore.nn.loss.loss import LossBase
 from mindspore.train._utils import check_value_type, _make_directory
 from mindspore._c_expression import security
-from mindspore._c_expression import
+from mindspore._c_expression import collect_host_info, get_clock_syscnt
 
 HYPER_CONFIG_ENV_NAME = "MINDINSIGHT_HYPER_CONFIG"
 HYPER_CONFIG_LEN_LIMIT = 100000
@@ -472,7 +472,7 @@ class SummaryCollector(Callback):
 
     def begin(self, run_context):
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "SummaryCollector", "begin", start_time=get_clock_syscnt(), level=1)
         self._check_callbacks(cb_params)
 
         if cb_params.mode not in ModeEnum.to_list():
@@ -484,7 +484,7 @@ class SummaryCollector(Callback):
 
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "SummaryCollector", "step_end", start_time=get_clock_syscnt(), level=1)
         if cb_params.mode != ModeEnum.TRAIN.value:
             return
 
@@ -559,7 +559,7 @@ class SummaryCollector(Callback):
 
     def epoch_end(self, run_context):
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "SummaryCollector", "epoch_end", start_time=get_clock_syscnt(), level=1)
         self._collect_tensor_data(cb_params)
         collect_landscape = self._collect_specified_data.get('collect_landscape')
         if collect_landscape is not None:
@@ -576,7 +576,7 @@ class SummaryCollector(Callback):
 
     def end(self, run_context):
         cb_params = run_context.original_args()
-
+        collect_host_info("Callback", "SummaryCollector", "end", start_time=get_clock_syscnt(), level=1)
        if cb_params.mode == ModeEnum.TRAIN.value:
            self._collect_train_lineage(cb_params)
        else:
|