mindspore: mindspore-2.7.0-cp310-cp310-win_amd64.whl → mindspore-2.7.0rc1-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +1 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +2 -2
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -0
- mindspore/_extends/parse/parser.py +22 -28
- mindspore/_extends/parse/standard_method.py +1 -15
- mindspore/_extends/pijit/pijit_func_white_list.py +5 -2
- mindspore/_extends/remote/kernel_build_server_ascend.py +75 -0
- mindspore/amp.py +18 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/__init__.py +12 -18
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +38 -102
- mindspore/common/_utils.py +1 -9
- mindspore/common/api.py +106 -155
- mindspore/common/{dynamic_shape/auto_dynamic_shape.py → auto_dynamic_shape.py} +23 -17
- mindspore/common/dtype.py +57 -98
- mindspore/common/dump.py +1 -1
- mindspore/common/file_system.py +9 -59
- mindspore/common/hook_handle.py +3 -22
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +20 -4
- mindspore/common/recompute.py +4 -2
- mindspore/common/tensor.py +52 -38
- mindspore/communication/_hccl_management.py +297 -0
- mindspore/context.py +21 -15
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +1 -35
- mindspore/dataset/engine/datasets.py +315 -330
- mindspore/dataset/engine/datasets_user_defined.py +22 -38
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +5 -17
- mindspore/dataset/vision/utils.py +21 -632
- mindspore/device_context/ascend/op_tuning.py +1 -35
- mindspore/dnnl.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -3
- mindspore/include/api/cell.h +4 -28
- mindspore/include/api/cfg.h +7 -24
- mindspore/include/api/context.h +0 -1
- mindspore/include/api/delegate.h +2 -0
- mindspore/include/api/dual_abi_helper.h +19 -100
- mindspore/include/api/graph.h +1 -14
- mindspore/include/api/kernel.h +3 -16
- mindspore/include/api/kernel_api.h +1 -9
- mindspore/include/api/metrics/accuracy.h +0 -9
- mindspore/include/api/model.h +1 -5
- mindspore/include/api/model_group.h +0 -4
- mindspore/include/api/model_parallel_runner.h +0 -2
- mindspore/include/api/status.h +10 -48
- mindspore/include/api/types.h +1 -6
- mindspore/include/dataset/constants.h +0 -9
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +2 -3
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -5
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/distributed/__init__.py +0 -4
- mindspore/mint/distributed/distributed.py +14 -217
- mindspore/mint/nn/layer/_functions.py +2 -1
- mindspore/mint/nn/layer/conv.py +6 -6
- mindspore/mint/nn/layer/normalization.py +3 -3
- mindspore/nn/cell.py +174 -216
- mindspore/nn/layer/activation.py +2 -4
- mindspore/nn/layer/basic.py +13 -7
- mindspore/nn/layer/image.py +1 -1
- mindspore/nn/optim/adam.py +3 -1
- mindspore/nn/optim/lamb.py +3 -1
- mindspore/nn/optim/tft_wrapper.py +3 -2
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +5 -39
- mindspore/nn/wrap/grad_reducer.py +15 -0
- mindspore/numpy/array_creations.py +2 -2
- mindspore/numpy/utils_const.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_op_impl/cpu/__init__.py +0 -1
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +2 -12
- mindspore/ops/auto_generate/gen_extend_func.py +4 -4
- mindspore/ops/auto_generate/gen_ops_def.py +16 -290
- mindspore/ops/auto_generate/gen_ops_prim.py +76 -563
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/function/__init__.py +0 -1
- mindspore/ops/function/array_func.py +6 -10
- mindspore/ops/function/debug_func.py +2 -4
- mindspore/ops/function/grad/grad_func.py +12 -4
- mindspore/ops/function/math_func.py +32 -44
- mindspore/ops/function/nn_func.py +20 -18
- mindspore/ops/functional.py +1 -2
- mindspore/ops/functional_overload.py +12 -23
- mindspore/ops/operations/_inner_ops.py +12 -11
- mindspore/ops/operations/array_ops.py +50 -4
- mindspore/ops/operations/comm_ops.py +15 -1
- mindspore/ops/operations/custom_ops.py +4 -10
- mindspore/ops/operations/debug_ops.py +6 -6
- mindspore/ops/operations/manually_defined/ops_def.py +12 -12
- mindspore/ops/operations/math_ops.py +5 -5
- mindspore/ops/operations/nn_ops.py +1 -1
- mindspore/ops/primitive.py +10 -3
- mindspore/ops/tensor_method.py +7 -16
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +16 -0
- mindspore/parallel/_auto_parallel_context.py +15 -5
- mindspore/parallel/_parallel_serialization.py +2 -3
- mindspore/parallel/_ps_context.py +2 -2
- mindspore/parallel/_transformer/transformer.py +4 -4
- mindspore/parallel/_utils.py +11 -5
- mindspore/parallel/auto_parallel.py +9 -23
- mindspore/parallel/checkpoint_transform.py +0 -2
- mindspore/parallel/cluster/process_entity/_api.py +1 -4
- mindspore/parallel/cluster/run.py +3 -5
- mindspore/parallel/function/reshard_func.py +5 -6
- mindspore/parallel/nn/parallel_cell_wrapper.py +3 -40
- mindspore/parallel/nn/parallel_grad_reducer.py +8 -0
- mindspore/parallel/shard.py +21 -7
- mindspore/parallel/transform_safetensors.py +4 -10
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +9 -10
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
- mindspore/profiler/common/msprof_cmd_tool.py +2 -2
- mindspore/profiler/common/path_manager.py +0 -9
- mindspore/profiler/common/profiler_context.py +2 -25
- mindspore/profiler/common/profiler_meta_data.py +0 -1
- mindspore/profiler/common/profiler_op_analyse.py +6 -10
- mindspore/{ops/_op_impl/cpu/joinedstr_op.py → profiler/common/validator/__init__.py} +1 -15
- mindspore/profiler/common/validator/validate_path.py +84 -0
- mindspore/profiler/dynamic_profiler.py +46 -91
- mindspore/profiler/envprofiler.py +5 -30
- mindspore/profiler/experimental_config.py +1 -16
- mindspore/profiler/platform/cpu_profiler.py +4 -10
- mindspore/profiler/platform/npu_profiler.py +1 -1
- mindspore/profiler/profiler.py +145 -193
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/runtime/__init__.py +4 -6
- mindspore/runtime/executor.py +0 -27
- mindspore/runtime/memory.py +0 -1
- mindspore/runtime/thread_bind_core.py +1 -1
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +3 -3
- mindspore/train/amp.py +3 -0
- mindspore/train/callback/_callback.py +1 -2
- mindspore/train/callback/_checkpoint.py +8 -1
- mindspore/train/callback/_flops_collector.py +6 -10
- mindspore/train/callback/_train_fault_tolerance.py +7 -3
- mindspore/train/data_sink.py +4 -4
- mindspore/train/dataset_helper.py +5 -5
- mindspore/train/model.py +20 -4
- mindspore/train/serialization.py +15 -35
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/hooks.py +81 -0
- mindspore/utils/utils.py +8 -8
- mindspore/version.py +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +193 -192
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +0 -1109
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/dynamic_shape/enable_dynamic.py +0 -197
- /mindspore/common/{dynamic_shape/_auto_dynamic.py → _auto_dynamic.py} +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
mindspore/train/callback/_checkpoint.py
CHANGED

@@ -628,6 +628,13 @@ class ModelCheckpoint(Callback):
         if "step_num" in self._append_dict:
             self._append_dict["step_num"] = self._append_step_num + step_num
 
+    def _update_save_step(self, cb_params):
+        """update step if used async d2h copy"""
+        step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
+        if self._d2h_async and self._run_mode == context.GRAPH_MODE:
+            step_num_in_epoch -= 1
+        return step_num_in_epoch
+
     def _save_ckpt(self, cb_params, force_to_save=False):
         """Save checkpoint files."""
         if cb_params.cur_step_num == self._last_triggered_step:
@@ -638,7 +645,7 @@ class ModelCheckpoint(Callback):
         self._flush_from_cache(cb_params)
 
         save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
-        step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
+        step_num_in_epoch = self._update_save_step(cb_params)
 
         if save_ckpt:
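`_update_save_step` centralizes the step-in-epoch arithmetic: `cur_step_num` is a global 1-based counter, so `(cur - 1) % batch_num + 1` yields the 1-based step within the current epoch, and the extra decrement apparently accounts for the one-step lag introduced by the async device-to-host copy. A standalone sketch of that arithmetic (plain booleans stand in for the callback's flags):

    def step_in_epoch(cur_step_num: int, batch_num: int, d2h_async: bool = False) -> int:
        """1-based step within the current epoch for a 1-based global step counter."""
        step = (cur_step_num - 1) % batch_num + 1
        if d2h_async:
            step -= 1   # async D2H copy: the saved data corresponds to the previous step
        return step

    assert step_in_epoch(1, 100) == 1      # first step of epoch 1
    assert step_in_epoch(100, 100) == 100  # last step of epoch 1
    assert step_in_epoch(101, 100) == 1    # wraps to the first step of epoch 2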
mindspore/train/callback/_flops_collector.py
CHANGED

@@ -31,6 +31,7 @@ from mindspore.communication.management import (create_group, get_group_size,
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.ops import operations as P
 from mindspore.common import Tensor
+from mindspore import context
 import mindspore.nn as nn
 
@@ -151,21 +152,16 @@ class FlopsUtilizationCollector(Callback):
         """
         Check whether FlopsUtilizationCollector is working in the current environment
         """
+        if context.get_context("mode") != context.GRAPH_MODE:
+            if self.verbose:
+                raise ValueError("FlopsUtilizationCollector now only support graph mode.")
+            logger.info("FlopsUtilizationCollector now only support graph mode.")
+            return False
         cb_params = run_context.original_args()
         if cb_params.mode == 'train':
             network = cb_params.train_network
-            if not network.compiled:
-                if self.verbose:
-                    raise ValueError("FlopsUtilizationCollector now only support graph mode.")
-                logger.info("FlopsUtilizationCollector now only support graph mode.")
-                return False
         elif cb_params.mode == 'eval':
             network = cb_params.eval_network
-            if not network.compiled:
-                if self.verbose:
-                    raise ValueError("FlopsUtilizationCollector now only support graph mode.")
-                logger.info("FlopsUtilizationCollector now only support graph mode.")
-                return False
         else:
             if self.verbose:
                 raise ValueError('FlopsUtilizationCollector only support train and eval mode!')
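The refactor hoists the environment check: instead of testing `network.compiled` separately in the train and eval branches, one graph-mode test at the top decides whether the collector runs at all. A small sketch of the raise-or-log guard it preserves (standalone illustrative names, not the real callback):

    import logging

    logger = logging.getLogger(__name__)

    def check_supported(is_graph_mode: bool, verbose: bool) -> bool:
        """Raise in verbose mode, otherwise log and silently disable."""
        if not is_graph_mode:
            if verbose:
                raise ValueError("FlopsUtilizationCollector now only support graph mode.")
            logger.info("FlopsUtilizationCollector now only support graph mode.")
            return False
        return True

    assert check_supported(True, verbose=True)
    assert not check_supported(False, verbose=False)   # disabled without raising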
mindspore/train/callback/_train_fault_tolerance.py
CHANGED

@@ -167,6 +167,7 @@ def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
     logger.warning(f"Enter _tft_stop_callback device_id: {cb_ctx.device_id}")
     _stop_device(cb_ctx.device_id)
+    cb_ctx.stop_been_called = True
     if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()):  # pylint: disable=W0212
         raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
     cb_ctx.is_uce_rank = False
@@ -191,7 +192,7 @@ def _tft_rebuild_sub_groups(fault_ranks, args, ctx):
 class TrainFaultTolerance(Callback):
     """
     This callback is used to enable the TFT feature
-    `MindIO TFT <https://www.hiascend.com/document/detail/zh/mindx-dl/
+    `MindIO TFT <https://www.hiascend.com/document/detail/zh/mindx-dl/60rc2/mindio/mindiottp/mindiottp001.html>`_
     and will execute TFT operations during training process, such as TFT init, report and exception handle.
 
     Note:
@@ -339,6 +340,7 @@ class TrainFaultTolerance(Callback):
         self.learning_rate = None
         self.has_init_replica = False
         self.is_uce_rank = False
+        self.stop_been_called = False
 
         self.assign = mindspore.ops.Assign()
         self.g_one = Parameter(Tensor([1], dtype=mstype.int32))
@@ -380,9 +382,11 @@ class TrainFaultTolerance(Callback):
         _tft_handler.init(config=None)
         self.tft = _tft_handler.get_tft()
         logger.warning(f"TFT handle init ok.")
+        mode = context.get_context("mode")
         device_target = context.get_context("device_target")
-        if device_target != "Ascend":
-            raise ValueError(f"MindIO adataper only support on Ascend device
+        if device_target != "Ascend" or mode != context.GRAPH_MODE:
+            raise ValueError(f"MindIO adataper only support on Ascend device with GRAPH Mode!"
+                             f"device:{device_target}, run mode: {mode}")
 
     def _is_params_consistent(self):
         for key, param in self.cb_params.train_network.parameters_and_names():
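The new `stop_been_called` flag set here is consumed by `_handle_exception_info` in `model.py` (see the model.py section below) to decide whether an HCCL error was self-inflicted by a force stop. A standalone sketch of the flag's life cycle, with illustrative names rather than the real TFT classes:

    class FaultContext:
        def __init__(self):
            self.stop_been_called = False

    def on_stop(ctx: FaultContext) -> None:
        ctx.stop_been_called = True          # set when the device is force-stopped

    def on_hccl_error(ctx: FaultContext) -> str:
        if ctx.stop_been_called:             # the error was caused by our own stop
            return "RS_NORMAL"
        return "RS_HCCL_FAILED"              # genuine communication failure

    def on_repair_finished(ctx: FaultContext) -> None:
        ctx.stop_been_called = False         # reset once training resumes

    ctx = FaultContext()
    on_stop(ctx)
    assert on_hccl_error(ctx) == "RS_NORMAL"
    on_repair_finished(ctx)
    assert on_hccl_error(ctx) == "RS_HCCL_FAILED"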
mindspore/train/data_sink.py
CHANGED

@@ -16,7 +16,7 @@
 from functools import wraps
 import mindspore.ops as ops
 from mindspore import context
-from mindspore.common.dtype import
+from mindspore.common.dtype import pytype_to_dtype
 from mindspore.common.api import jit
 from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, enable_data_broadcast
 from mindspore.train.dataset_helper import _has_dynamic_shape, _check_inputs
@@ -61,7 +61,7 @@ def _init_sink_dataset(dataset, sink_size, input_signature, create_info):
     _check_inputs(input_signature, dataset_shapes, dataset_types)
 
     queue_name = transfer_dataset.queue_name
-    if _need_to_full():
+    if _need_to_full() and context.get_context('mode') == context.GRAPH_MODE:
         device_num = _get_device_num() // _get_pipeline_stages()
         dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
     next_op = ops.GetNext(dataset_types, dataset_shapes, len(dataset_types), queue_name)
@@ -94,12 +94,12 @@ def _get_next_op(dataset, ori_next_op, is_info_queue):
 
     queue_name = dataset.__transfer_dataset__.queue_name
     dataset_types, dataset_shapes = dataset.__transfer_dataset__.get_data_info()
-    dataset_types = [
+    dataset_types = [pytype_to_dtype(x) for x in dataset_types]
     key = str(dataset_types) + str(dataset_shapes)
     if key in dataset.__sink_aux__.next_ops:
         next_op = dataset.__sink_aux__.next_ops[key]
     else:
-        if _need_to_full():
+        if _need_to_full() and context.get_context('mode') == context.GRAPH_MODE:
             device_num = _get_device_num() // _get_pipeline_stages()
             dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
             next_op = ops.GetNext(dataset_types, dataset_shapes, len(dataset_types), queue_name)
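Both guarded branches expand per-device shapes to global-batch shapes before building `GetNext`. A rough sketch of what that expansion does, under the assumption (matching how `_to_full_shapes` is used here) that only the leading batch dimension is scaled; `to_full_shapes` below is an illustrative reimplementation, not MindSpore's helper:

    def to_full_shapes(shapes, device_num):
        """Scale the leading (batch) dimension of every shape by device_num."""
        return [[dim * device_num if i == 0 else dim for i, dim in enumerate(shape)]
                for shape in shapes]

    # Eight data-parallel devices, per-device batch of 32 -> global batch of 256.
    print(to_full_shapes([[32, 3, 224, 224], [32]], device_num=8))
    # [[256, 3, 224, 224], [256]]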
mindspore/train/dataset_helper.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright 2020
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,8 +20,8 @@ import copy
 
 from mindspore import _checkparam as Validator
 from mindspore import log as logger
-from mindspore.common.
-from mindspore.common.dtype import
+from mindspore.common._auto_dynamic import is_auto_dynamic, convert_new_shapes
+from mindspore.common.dtype import pytype_to_dtype
 from mindspore.common.api import _cell_graph_executor, _is_args_fullmode, ARG_SPECIFIED
 from mindspore.common._utils import is_shape_unknown
 from mindspore.dataset.core import config as dataset_config
@@ -34,7 +34,7 @@ from mindspore.parallel._utils import _get_device_num, _get_global_rank, _need_t
     _origin_shapes, _dynamic_shape_for_dataset
 from mindspore.parallel._ps_context import _is_role_sched
 from mindspore.ops import operations as P
-from mindspore.common.
+from mindspore.common.auto_dynamic_shape import _auto_dynamic_shape
 
 
 def _send_data(dataset, epoch_num):
@@ -275,7 +275,7 @@ def connect_network_with_dataset(network, dataset_helper):
     # Need to do full_batch for shapes which also do in the _DatasetIterMSLoopSink
     if _need_to_full():
         dataset_shapes = _to_full_shapes(dataset_shapes, _get_device_num() // _get_pipeline_stages())
-    dataset_types = [
+    dataset_types = [pytype_to_dtype(x) for x in dataset_types]
     if not is_dynamic:
         dataset_shapes = _auto_dynamic_shape.auto_dynamic_generate_compile_args(dataset_shapes, True)
     key = str(dataset_types) + str(dataset_shapes)
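Both this file and `data_sink.py` memoize the sink op behind a string key built from the converted types and shapes. A minimal sketch of that caching pattern (names illustrative):

    next_ops = {}

    def get_or_create(dataset_types, dataset_shapes, factory):
        """Reuse an op for a previously seen (types, shapes) signature."""
        key = str(dataset_types) + str(dataset_shapes)   # signature of the data info
        if key not in next_ops:
            next_ops[key] = factory(dataset_types, dataset_shapes)
        return next_ops[key]

    op_a = get_or_create(["int32"], [[32, 10]], lambda t, s: object())
    op_b = get_or_create(["int32"], [[32, 10]], lambda t, s: object())
    assert op_a is op_b   # identical signature reuses the cached op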
mindspore/train/model.py
CHANGED

@@ -156,7 +156,11 @@ def _handle_exception_info(obj, uce_env, tft, e):
         tft.tft_report_error(tft.ReportState.RS_UCE.value)
     elif "HCCEError" in e_str:
         logger.warning("uce wrapper caught HCCEError")
-        tft.tft_report_error(tft.ReportState.RS_HCCL_FAILED.value)
+        if obj.stop_been_called:
+            logger.warning("Received HCCEError after force stop been called, so report force stopped error to MindIO.")
+            tft.tft_report_error(tft.ReportState.RS_NORMAL.value)
+        else:
+            tft.tft_report_error(tft.ReportState.RS_HCCL_FAILED.value)
     elif "ForceStopError" in e_str:
         logger.warning("uce wrapper caught RuntimeError ForceStopError")
         force_stop_err = tft.ReportState.RS_NORMAL.value
@@ -266,6 +270,7 @@ def _handle_tft(func):
             ret = obj.tft.tft_wait_next_action()
             if ret == obj.tft.Action.EXIT.value:
                 raise e
+            obj.stop_been_called = False
             repair_step = obj.tft.tft_get_repair_step()
             logger.warning(
                 "uce wrapper caught repair finish REPAIR STEP: {} batch_num:{}".format(repair_step,
@@ -303,6 +308,9 @@ def _check_tft():
     ascend_target = MSContext.get_instance().get_ascend_soc_version()
     if ascend_target == 'ascend910':
         raise ValueError("TFT is not supported when using ascend910")
+    ms_mode = context.get_context("mode")
+    if ms_mode != mindspore.GRAPH_MODE:
+        raise ValueError("TFT is only supported in GRAPH_MODE")
     jit_level = context.get_context("jit_level")
     if jit_level == "O2" and ("UCE:1" in tft_env or "ARF:1" in tft_env):
         raise ValueError("TFT is not supported when using jit_level == O2")
@@ -812,7 +820,7 @@ class Model:
         """
         if os.environ.get("MS_ENABLE_CKPT_D2H_ASYNC") != "1":
             return
-        if context.get_context("device_target") == "Ascend":
+        if (context.get_context("mode") == context.GRAPH_MODE) and (context.get_context("device_target") == "Ascend"):
            cb_params.need_ckpt, cb_params.save_checkpoint_steps, \
                cb_params.last_triggered_step = self._check_need_ckpt(cb_params.list_callback)
            logger.info(f"need_ckpt:{cb_params.need_ckpt},"
@@ -880,8 +888,8 @@ class Model:
            sink_size (int): Control the amount of data in each sink. Default: -1.
            epoch (int): Total number of iterations on the data. Default: 1.
        """
-        if context.get_context("device_target") != "Ascend":
-            raise RuntimeError('Pre-init process only supports Ascend target currently.')
+        if context.get_context("mode") != context.GRAPH_MODE or context.get_context("device_target") != "Ascend":
+            raise RuntimeError('Pre-init process only supports GRAPH MODE and Ascend target currently.')
 
        if not train_dataset and not valid_dataset:
            raise ValueError("The argument 'train_dataset' and 'valid_dataset' can not both be None or empty.")
@@ -1212,6 +1220,8 @@ class Model:
        if not enable_recovery:
            self.enable_recovery = False
        else:
+            if context.get_context("mode") != context.GRAPH_MODE:
+                raise RuntimeError("Recovery for training only support graph mode currently.")
            self.enable_recovery = enable_recovery and _is_role_worker()
 
    def _check_need_load_ckpt(self, cb_params, dataset_size, sink_size=-1):
@@ -2189,6 +2199,9 @@ class Model:
            dataset_sink_mode (bool): Determines whether to pass the data through dataset channel.
            sink_size (int): Control the amount of data in each sink.
        """
+        if context.get_context("mode") != context.GRAPH_MODE:
+            raise RuntimeError("Pre-compile process that generate parameter layout for the train network "
+                               "only supports GRAPH MODE and Ascend target currently.")
        if _get_parallel_mode() not in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            raise RuntimeError("'infer_train_layout' only supports 'semi_auto_parallel' and 'auto_parallel' "
                               "mode, but got {}.".format(_get_parallel_mode()))
@@ -2348,6 +2361,9 @@ class Model:
        >>> predict_map = model.infer_predict_layout(inputs)
        """
        _init_auto_parallel_context(self._network)
+        if context.get_context("mode") != context.GRAPH_MODE:
+            raise RuntimeError("Pre-compile process that generate parameter layout for the predict network "
+                               "only supports GRAPH MODE and Ascend target currently.")
        if _get_parallel_mode() not in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            raise RuntimeError('Infer predict layout only supports semi auto parallel and auto parallel mode.')
        _parallel_predict_check()
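The same `GRAPH_MODE` guard is inlined at five call sites in this file. Purely as a hypothetical consolidation (not part of the diff), the check could live in one helper:

    from mindspore import context

    def _require_graph_mode(feature: str) -> None:
        """Raise if the current execution mode is not GRAPH_MODE."""
        if context.get_context("mode") != context.GRAPH_MODE:
            raise RuntimeError(f"{feature} only supports GRAPH MODE currently.")

    # Usage at the top of a guarded method:
    # _require_graph_mode("Pre-init process")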
mindspore/train/serialization.py
CHANGED

@@ -52,6 +52,7 @@ from mindspore.log import vlog_print
 from mindspore._checkparam import check_input_data, check_input_dataset
 from mindspore import _checkparam as Validator
 from mindspore.common import dtype as mstype
+from mindspore.common import np_dtype
 from mindspore.common.api import _cell_graph_executor as _executor
 from mindspore.common.api import _JitExecutor
 from mindspore.common.api import _get_parameter_layout
@@ -85,9 +86,12 @@ tensor_to_ms_type = {"Int8": mstype.int8, "UInt8": mstype.uint8, "Int16": mstype
                      "Float16": mstype.float16, "Float32": mstype.float32, "Float64": mstype.float64,
                      "Bool": mstype.bool_, "str": mstype.string, "BFloat16": mstype.bfloat16, "Int4": mstype.qint4x2}
 
-
-
-
+tensor_to_np_type = {"Int8": np.int8, "UInt8": np.uint8, "Int16": np.int16, "UInt16": np.uint16,
+                     "Int32": np.int32, "UInt32": np.uint32, "Int64": np.int64, "UInt64": np.uint64,
+                     "Float16": np.float16, "Float32": np.float32, "Float64": np.float64, "Bool": np.bool_, "str": "U"}
+
+if hasattr(np_dtype, "bfloat16"):
+    tensor_to_np_type["BFloat16"] = np_dtype.bfloat16
 
 np_type_convert = {"int32": np.int32, "float32": np.float32, "float16": np.float16, "float64": np.float64}
 
@@ -110,21 +114,6 @@ INT_64_MAX = 9223372036854775807
 cpu_cast = Cast().set_device("CPU")
 
 _ckpt_fs = FileSystem()
-_ckpt_fs_initialized = False
-
-
-def tensor_to_np_type(tensor_type_str):
-    """tensor to numpy type"""
-    if tensor_type_str == "BFloat16":
-        from mindspore.common import np_dtype
-        if not np_dtype.np_dtype_valid(True):
-            raise TypeError(
-                "The Numpy bfloat16 data type is not supported now, please ensure that the current "
-                "Numpy version is not less than the version when the mindspore is compiled, "
-                "and the major versions are same."
-            )
-        return np_dtype.bfloat16
-    return _tensor_to_np_type.get(tensor_type_str)
 
 
 def init_ckpt_file_system(fs: FileSystem):
@@ -134,12 +123,8 @@ def init_ckpt_file_system(fs: FileSystem):
     _register_basic_file_system(fs)
 
 
-
-
-    global _ckpt_fs_initialized
-    if not _ckpt_fs_initialized:
-        init_ckpt_file_system(_ckpt_fs)
-        _ckpt_fs_initialized = True
+# Initialize checkpoint file system
+init_ckpt_file_system(_ckpt_fs)
 
 
 def _wait_async_process_save_ckpt():
@@ -473,7 +458,7 @@ def _exec_save(ckpt_file_name, data_list, enc_key=None, enc_mode="AES-GCM", map_
                                f"simultaneously modified a file.")
         elif _ckpt_fs.backend != "mindio":
             os.rename(tmp_name, ckpt_file_name)
-
+            os.chmod(ckpt_file_name, stat.S_IRUSR)
     except BaseException as e:
         logger.critical("Failed to save the checkpoint file %s. Maybe don't have the permission to write files, "
                         "or the disk space is insufficient and so on.", ckpt_file_name)
@@ -733,7 +718,6 @@ def save_checkpoint(save_obj, ckpt_file_name, integrated_save=True,
     <https://mindspore.cn/tutorials/en/master/beginner/save_load.html#saving-and-loading-the-model-weight>`_
     """
     start_save_time = time.time()
-    _ensure_ckpt_fs_initialized()
     ckpt_file_name = _check_save_obj_and_ckpt_file_name(save_obj, ckpt_file_name, format)
     integrated_save = Validator.check_bool(integrated_save)
     async_save = _check_async_save(async_save)
@@ -1284,7 +1268,11 @@ def _load_into_param_dict(ckpt_file_name, parameter_dict, specify_prefix, filter
                 continue
             data = element.tensor.tensor_content
             data_type = element.tensor.tensor_type
+            np_type = tensor_to_np_type.get(data_type)
             ms_type = tensor_to_ms_type[data_type]
+            if data_type == 'str':
+                str_length = int(len(data) / 4)
+                np_type = np_type + str(str_length)
             param_data_list.append(data)
             if (element_id == len(checkpoint_list.value) - 1) or \
                     (element.tag != checkpoint_list.value[element_id + 1].tag):
@@ -1292,8 +1280,6 @@ def _load_into_param_dict(ckpt_file_name, parameter_dict, specify_prefix, filter
                 param_data_list.clear()
                 dims = element.tensor.dims
                 if data_type == 'str':
-                    str_length = int(len(data) / 4)
-                    np_type = "U" + str(str_length)
                     str_value = np.frombuffer(new_data, np_type)
                     parameter_dict[element.tag] = str(str_value[0])
                 else:
@@ -1400,7 +1386,6 @@ def load_checkpoint(ckpt_file_name, net=None, strict_load=False, filter_prefix=N
     """
     start_load_time = time.time()
     vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin load checkpoint.")
-    _ensure_ckpt_fs_initialized()
     specify_prefix = _check_prefix(specify_prefix)
     filter_prefix = _check_prefix(filter_prefix)
     dec_key = Validator.check_isinstance('dec_key', dec_key, (type(None), bytes))
@@ -2213,11 +2198,6 @@ def _save_onnx(net, file_name, *inputs, **kwargs):
     file_name += ".onnx"
     if os.path.exists(file_name):
         os.chmod(file_name, stat.S_IWUSR)
-    else:
-        dir_path = os.path.dirname(file_name)
-        if not os.path.exists(dir_path):
-            os.makedirs(dir_path, mode=0o700, exist_ok=True)
-            os.chmod(dir_path, 0o700)
     with open(file_name, 'wb') as f:
         f.write(onnx_stream)
     os.chmod(file_name, stat.S_IRUSR)
@@ -2614,7 +2594,7 @@ def parse_print(print_file_name):
         dims = print_.tensor.dims
         data_type = print_.tensor.tensor_type
         data = print_.tensor.tensor_content
-        np_type = tensor_to_np_type(data_type)
+        np_type = tensor_to_np_type.get(data_type)
         param_data = np.fromstring(data, np_type)
         ms_type = tensor_to_ms_type.get(data_type)
         if dims and dims != [0]:
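The relocated string branch relies on checkpoint strings being stored at four bytes per character, so the numpy dtype width is `len(data) / 4`. A self-contained sketch of that decode (assumes a little-endian host, where numpy's `U` dtype matches UTF-32-LE):

    import numpy as np

    data = "hello".encode("utf-32-le")       # 20 bytes, 4 bytes per character
    np_type = "U" + str(int(len(data) / 4))  # -> "U5"
    value = np.frombuffer(data, np_type)[0]
    assert value == "hello"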
mindspore/train/train_thor/model_thor.py
CHANGED

@@ -29,7 +29,7 @@ from mindspore import nn
 from mindspore.train.model import Model
 from mindspore.train.dataset_helper import connect_network_with_dataset
 from mindspore.parallel._utils import _need_to_full, _to_full_tensor
-from mindspore.common.dtype import
+from mindspore.common.dtype import pytype_to_dtype
 from mindspore._c_expression import init_exec_dataset
 from mindspore.train.train_thor.dataset_helper import DatasetHelper
 
@@ -46,7 +46,7 @@ def _convert_to_ms_type(types):
     """
     ms_types = []
     for numpy_type in types:
-        ms_type =
+        ms_type = pytype_to_dtype(numpy_type)
         ms_types.append(ms_type)
     return ms_types
mindspore/turbojpeg.dll
CHANGED

Binary file
mindspore/utils/hooks.py
ADDED

@@ -0,0 +1,81 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""hooks"""
+from collections import OrderedDict
+import weakref
+from typing import Any, Tuple
+
+
+class _RemovableHandle:
+    r"""
+    A handle which provides the capability to remove a hook.
+
+    Args:
+        hooks_dict (dict): A dictionary of hooks, indexed by hook `id`.
+
+    Keyword Args:
+        extra_dict (Union[dict, list[dict]], optional): An additional dictionary or list of
+            dictionaries whose keys will be deleted when the same keys are
+            removed from `hooks_dict`. Default ``None``.
+    """
+
+    id: int
+    next_id: int = 0
+
+    def __init__(self, hooks_dict: Any, *, extra_dict: Any = None) -> None:
+        self.hooks_dict_ref = weakref.ref(hooks_dict)
+        self.id = _RemovableHandle.next_id
+        _RemovableHandle.next_id += 1
+
+        self.extra_dict_ref: Tuple = ()
+        if isinstance(extra_dict, dict):
+            self.extra_dict_ref = (weakref.ref(extra_dict),)
+        elif isinstance(extra_dict, list):
+            self.extra_dict_ref = tuple(weakref.ref(d) for d in extra_dict)
+
+    def remove(self) -> None:
+        hooks_dict = self.hooks_dict_ref()
+        if hooks_dict is not None and self.id in hooks_dict:
+            del hooks_dict[self.id]
+
+        for ref in self.extra_dict_ref:
+            extra_dict = ref()
+            if extra_dict is not None and self.id in extra_dict:
+                del extra_dict[self.id]
+
+    def __getstate__(self):
+        if self.extra_dict_ref is None:
+            return (self.hooks_dict_ref(), self.id)
+        return (self.hooks_dict_ref(), self.id, tuple(ref() for ref in self.extra_dict_ref))
+
+    def __setstate__(self, state) -> None:
+        if state[0] is None:
+            # create a dead reference
+            self.hooks_dict_ref = weakref.ref(OrderedDict())
+        else:
+            self.hooks_dict_ref = weakref.ref(state[0])
+        self.id = state[1]
+        _RemovableHandle.next_id = max(_RemovableHandle.next_id, self.id + 1)
+
+        if len(state) < 3 or state[2] is None:
+            self.extra_dict_ref = ()
+        else:
+            self.extra_dict_ref = tuple(weakref.ref(d) for d in state[2])
+
+    def __enter__(self) -> "_RemovableHandle":
+        return self
+
+    def __exit__(self, type: Any, value: Any, tb: Any) -> None:
+        self.remove()
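The new `_RemovableHandle` closely follows the familiar torch.utils.hooks design: the handle holds only a weak reference to the hook dictionary, so an outstanding handle never keeps it alive. A usage sketch against the class as added above (import path taken from this diff):

    from collections import OrderedDict
    from mindspore.utils.hooks import _RemovableHandle

    hooks = OrderedDict()
    handle = _RemovableHandle(hooks)
    hooks[handle.id] = lambda grad: grad * 2   # the registered hook

    with handle:                                # removed on __exit__
        assert handle.id in hooks
    assert handle.id not in hooks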
mindspore/utils/utils.py
CHANGED

@@ -132,16 +132,16 @@ class TftHandle:
         if "ARF:1" in tft_env:
             logger.warning(f"Disable hccl watchdog when using ARF.")
             context.set_context(ascend_config={"hccl_watchdog": False})
-
-
-
-            all_opts = [part.strip() for part in tft_env.split(",")] + ["TTP:1"]
-            os.environ["MS_ENABLE_TFT"] = "{" + ",".join(all_opts) + "}"
+            logger.warning(f"Turn on TTP config when using ARF.")
+            if "TTP:1,UCE:1,ARF:1" not in tft_env:
+                os.environ["MS_ENABLE_TFT"] = "{TTP:1,ARF:1}"
             os.environ["MS_ENABLE_RECOVERY"] = "1"
 
+        mode = context.get_context("mode")
         device_target = context.get_context("device_target")
-        if device_target != "Ascend":
-            logger.warning(f"MindIO adataper only support on Ascend device
+        if device_target != "Ascend" or mode != context.GRAPH_MODE:
+            logger.warning(f"MindIO adataper only support on Ascend device with GRAPH Mode!"
+                           f"device:{device_target}, run mode: {mode}")
             return
 
         ctrl_port = int(os.getenv("MS_TFT_PORT"))
@@ -154,7 +154,7 @@ class TftHandle:
             from mindio_ttp import framework_ttp as tft
             self.tft = tft
         except BaseException as e:
-            raise ModuleNotFoundError(f"Module
+            raise ModuleNotFoundError(f"Module nopt found. Detail info {str(e)}")
         world_size = int(os.getenv("MS_WORKER_NUM"))  # from msrun
         cur_rank = int(os.getenv("MS_NODE_ID"))  # from msrun
         enable_local_copy = False
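Both the old and new code treat `MS_ENABLE_TFT` as a brace-wrapped, comma-separated option string such as `{TTP:1,ARF:1}`. A small illustrative parser for that format (`tft_options` is hypothetical, not a MindSpore API):

    import os

    def tft_options(env_value: str) -> dict:
        """Parse '{TTP:1,ARF:1}' into {'TTP': '1', 'ARF': '1'}."""
        body = env_value.strip().strip("{}")
        return dict(part.strip().split(":", 1) for part in body.split(",") if part)

    os.environ["MS_ENABLE_TFT"] = "{TTP:1,ARF:1}"
    assert tft_options(os.environ["MS_ENABLE_TFT"]) == {"TTP": "1", "ARF": "1"}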
mindspore/version.py
CHANGED

@@ -1 +1 @@
-__version__ = '2.7.0'
+__version__ = '2.7.0rc1'
{mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mindspore
-Version: 2.7.0
+Version: 2.7.0rc1
 Summary: MindSpore is a new open source deep learning training/inference framework that could be used for mobile, edge and cloud scenarios.
 Home-page: https://www.mindspore.cn
 Author: The MindSpore Authors