mindspore 2.7.0__cp311-cp311-win_amd64.whl → 2.7.1__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic. Click here for more details.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +4 -1
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_extends/parse/compile_config.py +24 -1
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -2
- mindspore/_extends/parse/resources.py +1 -1
- mindspore/_extends/parse/standard_method.py +8 -1
- mindspore/_extends/parse/trope.py +2 -1
- mindspore/_extends/pijit/pijit_func_white_list.py +7 -22
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/base.py +29 -2
- mindspore/common/_decorator.py +3 -2
- mindspore/common/_grad_function.py +3 -1
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +275 -64
- mindspore/common/_utils.py +0 -44
- mindspore/common/api.py +285 -35
- mindspore/common/dump.py +7 -108
- mindspore/common/dynamic_shape/auto_dynamic_shape.py +1 -3
- mindspore/common/hook_handle.py +60 -0
- mindspore/common/jit_config.py +5 -1
- mindspore/common/jit_trace.py +27 -12
- mindspore/common/lazy_inline.py +5 -3
- mindspore/common/parameter.py +13 -107
- mindspore/common/recompute.py +4 -11
- mindspore/common/tensor.py +16 -169
- mindspore/communication/_comm_helper.py +11 -1
- mindspore/communication/comm_func.py +138 -4
- mindspore/communication/management.py +85 -1
- mindspore/config/op_info.config +0 -15
- mindspore/context.py +5 -85
- mindspore/dataset/engine/datasets.py +8 -4
- mindspore/dataset/engine/datasets_vision.py +1 -1
- mindspore/dataset/engine/validators.py +1 -15
- mindspore/dnnl.dll +0 -0
- mindspore/{experimental/llm_boost/ascend_native → graph}/__init__.py +7 -7
- mindspore/graph/custom_pass.py +55 -0
- mindspore/include/dataset/execute.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/__init__.py +3 -3
- mindspore/mindrecord/common/exceptions.py +1 -0
- mindspore/mindrecord/config.py +1 -1
- mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
- mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
- mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
- mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
- mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
- mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
- mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
- mindspore/mindrecord/filereader.py +4 -4
- mindspore/mindrecord/filewriter.py +5 -5
- mindspore/mindrecord/mindpage.py +2 -2
- mindspore/mindrecord/tools/cifar10.py +1 -1
- mindspore/mindrecord/tools/cifar100.py +1 -1
- mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
- mindspore/mindrecord/tools/cifar10_to_mr.py +1 -1
- mindspore/mindrecord/tools/csv_to_mr.py +1 -1
- mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
- mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_cluster.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_hardware_abstract.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mindspore_runtime_utils.dll +0 -0
- mindspore/mindspore_tools.dll +0 -0
- mindspore/mint/__init__.py +15 -10
- mindspore/mint/distributed/distributed.py +182 -62
- mindspore/mint/nn/__init__.py +2 -16
- mindspore/mint/nn/functional.py +4 -110
- mindspore/mint/nn/layer/__init__.py +0 -2
- mindspore/mint/nn/layer/activation.py +0 -6
- mindspore/mint/nn/layer/basic.py +0 -47
- mindspore/mint/nn/layer/conv.py +4 -4
- mindspore/mint/nn/layer/normalization.py +8 -13
- mindspore/mint/nn/layer/pooling.py +0 -4
- mindspore/nn/__init__.py +1 -3
- mindspore/nn/cell.py +16 -66
- mindspore/nn/layer/basic.py +49 -1
- mindspore/nn/layer/container.py +16 -0
- mindspore/nn/layer/embedding.py +4 -169
- mindspore/nn/layer/normalization.py +2 -1
- mindspore/nn/layer/thor_layer.py +4 -85
- mindspore/nn/optim/ada_grad.py +0 -1
- mindspore/nn/optim/adafactor.py +0 -1
- mindspore/nn/optim/adam.py +31 -124
- mindspore/nn/optim/adamax.py +0 -1
- mindspore/nn/optim/asgd.py +0 -1
- mindspore/nn/optim/ftrl.py +8 -102
- mindspore/nn/optim/lamb.py +0 -1
- mindspore/nn/optim/lars.py +0 -3
- mindspore/nn/optim/lazyadam.py +25 -218
- mindspore/nn/optim/momentum.py +5 -43
- mindspore/nn/optim/optimizer.py +6 -55
- mindspore/nn/optim/proximal_ada_grad.py +0 -1
- mindspore/nn/optim/rmsprop.py +0 -1
- mindspore/nn/optim/rprop.py +0 -1
- mindspore/nn/optim/sgd.py +0 -1
- mindspore/nn/optim/tft_wrapper.py +0 -1
- mindspore/nn/optim/thor.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -8
- mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
- mindspore/nn/probability/bijector/power_transform.py +20 -21
- mindspore/nn/probability/bijector/scalar_affine.py +5 -5
- mindspore/nn/probability/bijector/softplus.py +13 -14
- mindspore/nn/wrap/grad_reducer.py +4 -74
- mindspore/numpy/array_creations.py +2 -2
- mindspore/numpy/fft.py +9 -9
- mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
- mindspore/onnx/onnx_export.py +137 -0
- mindspore/opencv_core4110.dll +0 -0
- mindspore/opencv_imgcodecs4110.dll +0 -0
- mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
- mindspore/ops/__init__.py +2 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
- mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
- mindspore/ops/_op_impl/cpu/__init__.py +0 -5
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +16 -22
- mindspore/ops/auto_generate/gen_extend_func.py +2 -7
- mindspore/ops/auto_generate/gen_ops_def.py +98 -141
- mindspore/ops/auto_generate/gen_ops_prim.py +12708 -12686
- mindspore/ops/communication.py +97 -0
- mindspore/ops/composite/__init__.py +5 -2
- mindspore/ops/composite/base.py +15 -1
- mindspore/ops/composite/multitype_ops/__init__.py +3 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
- mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
- mindspore/ops/function/__init__.py +1 -0
- mindspore/ops/function/array_func.py +14 -12
- mindspore/ops/function/comm_func.py +3883 -0
- mindspore/ops/function/debug_func.py +3 -4
- mindspore/ops/function/math_func.py +45 -54
- mindspore/ops/function/nn_func.py +75 -294
- mindspore/ops/function/random_func.py +9 -18
- mindspore/ops/functional.py +2 -0
- mindspore/ops/functional_overload.py +354 -18
- mindspore/ops/operations/__init__.py +2 -5
- mindspore/ops/operations/_custom_ops_utils.py +7 -9
- mindspore/ops/operations/_inner_ops.py +1 -38
- mindspore/ops/operations/_rl_inner_ops.py +0 -933
- mindspore/ops/operations/array_ops.py +1 -0
- mindspore/ops/operations/comm_ops.py +94 -2
- mindspore/ops/operations/custom_ops.py +228 -19
- mindspore/ops/operations/debug_ops.py +27 -29
- mindspore/ops/operations/manually_defined/ops_def.py +27 -306
- mindspore/ops/operations/nn_ops.py +2 -2
- mindspore/ops/operations/sparse_ops.py +0 -83
- mindspore/ops/primitive.py +1 -17
- mindspore/ops/tensor_method.py +72 -3
- mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
- mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
- mindspore/ops_generate/api/functions_cc_generator.py +53 -4
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
- mindspore/ops_generate/common/gen_constants.py +11 -10
- mindspore/ops_generate/common/op_proto.py +18 -1
- mindspore/ops_generate/common/template.py +102 -245
- mindspore/ops_generate/common/template_utils.py +212 -0
- mindspore/ops_generate/gen_custom_ops.py +69 -0
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
- mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
- mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
- mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
- mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
- mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
- mindspore/ops_generate/resources/yaml_loader.py +13 -0
- mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
- mindspore/parallel/_cell_wrapper.py +1 -1
- mindspore/parallel/_parallel_serialization.py +1 -4
- mindspore/parallel/_utils.py +29 -6
- mindspore/parallel/checkpoint_transform.py +18 -2
- mindspore/parallel/cluster/process_entity/_api.py +24 -32
- mindspore/parallel/cluster/process_entity/_utils.py +9 -5
- mindspore/{experimental/llm_boost/atb → parallel/distributed}/__init__.py +21 -23
- mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
- mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
- mindspore/parallel/strategy.py +336 -0
- mindspore/parallel/transform_safetensors.py +117 -16
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +3 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
- mindspore/profiler/common/constant.py +5 -0
- mindspore/profiler/common/file_manager.py +9 -0
- mindspore/profiler/common/msprof_cmd_tool.py +38 -2
- mindspore/profiler/common/path_manager.py +56 -24
- mindspore/profiler/common/profiler_context.py +2 -12
- mindspore/profiler/common/profiler_info.py +3 -3
- mindspore/profiler/common/profiler_path_manager.py +13 -0
- mindspore/profiler/common/util.py +30 -3
- mindspore/profiler/experimental_config.py +2 -1
- mindspore/profiler/platform/npu_profiler.py +33 -6
- mindspore/run_check/_check_version.py +108 -24
- mindspore/runtime/__init__.py +3 -2
- mindspore/runtime/executor.py +11 -3
- mindspore/runtime/memory.py +112 -0
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
- mindspore/tools/data_dump.py +130 -0
- mindspore/tools/sdc_detect.py +91 -0
- mindspore/tools/stress_detect.py +63 -0
- mindspore/train/__init__.py +6 -6
- mindspore/train/_utils.py +5 -18
- mindspore/train/amp.py +6 -4
- mindspore/train/callback/_checkpoint.py +0 -9
- mindspore/train/callback/_train_fault_tolerance.py +69 -18
- mindspore/train/data_sink.py +1 -5
- mindspore/train/model.py +38 -211
- mindspore/train/serialization.py +126 -387
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +6 -3
- mindspore/utils/dlpack.py +92 -0
- mindspore/utils/dryrun.py +1 -1
- mindspore/utils/runtime_execution_order_check.py +10 -0
- mindspore/utils/sdc_detect.py +14 -12
- mindspore/utils/stress_detect.py +43 -0
- mindspore/utils/utils.py +144 -8
- mindspore/version.py +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/RECORD +254 -267
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -210
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
- mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
- mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
- mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
- mindspore/experimental/llm_boost/register.py +0 -130
- mindspore/experimental/llm_boost/utils.py +0 -31
- mindspore/include/OWNERS +0 -7
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
- mindspore/nn/reinforcement/_batch_read_write.py +0 -142
- mindspore/nn/reinforcement/_tensors_queue.py +0 -152
- mindspore/nn/reinforcement/tensor_array.py +0 -145
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
- mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
- mindspore/ops/_op_impl/cpu/buffer_append.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
- mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
- mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
- mindspore/ops/operations/_tensor_array.py +0 -359
- mindspore/ops/operations/rl_ops.py +0 -288
- mindspore/parallel/_offload_context.py +0 -275
- mindspore/parallel/_recovery_context.py +0 -115
- mindspore/parallel/_transformer/__init__.py +0 -35
- mindspore/parallel/_transformer/layers.py +0 -765
- mindspore/parallel/_transformer/loss.py +0 -251
- mindspore/parallel/_transformer/moe.py +0 -693
- mindspore/parallel/_transformer/op_parallel_config.py +0 -222
- mindspore/parallel/_transformer/transformer.py +0 -3124
- mindspore/parallel/mpi/_mpi_config.py +0 -116
- mindspore/train/memory_profiling_pb2.py +0 -298
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
# ============================================================================
|
|
15
15
|
"""
|
|
16
|
-
Generates mindspore/ccsrc/pybind_api/ir/tensor_py.cc which includes the CPython Tensor APIs.
|
|
16
|
+
Generates mindspore/ccsrc/pybind_api/ir/tensor/tensor_py.cc which includes the CPython Tensor APIs.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
19
|
import os
|
|
@@ -26,7 +26,7 @@ from pyboost import pyboost_utils
|
|
|
26
26
|
|
|
27
27
|
class TensorPyCppGenerator(BaseGenerator):
|
|
28
28
|
"""
|
|
29
|
-
This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor_register/
|
|
29
|
+
This class is responsible for generating mindspore/ccsrc/pybind_api/ir/tensor/tensor_register/
|
|
30
30
|
auto_generate/tensor_py_gen.cc
|
|
31
31
|
"""
|
|
32
32
|
def __init__(self):
|
|
@@ -263,7 +263,7 @@ def _single_parameter_broadcast(net, layout, param_not_load=None, param_loaded=N
|
|
|
263
263
|
if not single_params:
|
|
264
264
|
return
|
|
265
265
|
param_redundancy_reversed = _get_param_redundancy_reversed(param_redundancy, cur_rank)
|
|
266
|
-
if not param_redundancy_reversed
|
|
266
|
+
if not param_redundancy_reversed:
|
|
267
267
|
return
|
|
268
268
|
net_param_dict = net.parameters_dict()
|
|
269
269
|
_chang_parallel_context(origin_dataset_strategy)
|
|
@@ -526,10 +526,7 @@ def _make_dir(path, arg_name):
|
|
|
526
526
|
else:
|
|
527
527
|
ms.log.debug("The directory(%s) doesn't exist, will create it", path)
|
|
528
528
|
try:
|
|
529
|
-
|
|
530
|
-
os.umask(permissions << 3 | permissions)
|
|
531
|
-
mode = permissions << 6
|
|
532
|
-
os.makedirs(path, mode=mode, exist_ok=True)
|
|
529
|
+
os.makedirs(path, mode=0o700, exist_ok=True)
|
|
533
530
|
real_path = path
|
|
534
531
|
except PermissionError as e:
|
|
535
532
|
ms.log.critical("No write permission on the directory(%r), error = %r", path, e)
|
mindspore/parallel/_utils.py
CHANGED
|
@@ -14,12 +14,13 @@
|
|
|
14
14
|
# ============================================================================
|
|
15
15
|
"""Utils of auto parallel"""
|
|
16
16
|
import os
|
|
17
|
+
import re
|
|
17
18
|
from time import perf_counter
|
|
18
19
|
from importlib import import_module
|
|
19
20
|
import numpy as np
|
|
20
21
|
import mindspore as ms
|
|
21
22
|
from mindspore import context, log as logger
|
|
22
|
-
from mindspore._c_expression import reset_op_id, reset_op_id_with_offset
|
|
23
|
+
from mindspore._c_expression import reset_op_id
|
|
23
24
|
from mindspore.common.tensor import Tensor
|
|
24
25
|
from mindspore.common.dtype import _dtype_to_nptype
|
|
25
26
|
from mindspore.common import dtype as mstype
|
|
@@ -584,11 +585,6 @@ def _reset_op_id():
|
|
|
584
585
|
reset_op_id()
|
|
585
586
|
|
|
586
587
|
|
|
587
|
-
def _reset_op_id_with_offset():
|
|
588
|
-
"""Reset op id with offset."""
|
|
589
|
-
reset_op_id_with_offset()
|
|
590
|
-
|
|
591
|
-
|
|
592
588
|
def _parallel_predict_check():
|
|
593
589
|
"""validate parallel model prediction"""
|
|
594
590
|
if _is_in_auto_parallel_mode():
|
|
@@ -798,3 +794,30 @@ def _check_rank(cur_rank, initial_rank, pipeline_stages):
|
|
|
798
794
|
raise ValueError(f"For parameter broadcast, the cur_rank: {cur_rank} is wrong.")
|
|
799
795
|
if initial_rank % (get_group_size() / pipeline_stages) != 0:
|
|
800
796
|
raise ValueError(f"For parameter broadcast, the initial_rank: {initial_rank} is wrong.")
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def _check_path_safe(path, arg_name):
|
|
800
|
+
"""
|
|
801
|
+
Check input path string is safe.
|
|
802
|
+
"""
|
|
803
|
+
illegal_patterns = [
|
|
804
|
+
r"\.\.",
|
|
805
|
+
r"//+",
|
|
806
|
+
r"~",
|
|
807
|
+
r"^\s*$",
|
|
808
|
+
r"\./\."
|
|
809
|
+
]
|
|
810
|
+
for pattern in illegal_patterns:
|
|
811
|
+
if re.search(pattern, path):
|
|
812
|
+
pattern_info = pattern.replace('\\', '')
|
|
813
|
+
raise ValueError(f"{arg_name} contains '{pattern_info}' is not safe, please use a safe one.")
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
def _check_path_writable(path):
|
|
817
|
+
"""
|
|
818
|
+
Check the write permission of the input path.
|
|
819
|
+
"""
|
|
820
|
+
if not os.path.exists(path):
|
|
821
|
+
raise RuntimeError(f"{path} Path does not exist.")
|
|
822
|
+
if not os.access(path, os.W_OK):
|
|
823
|
+
raise PermissionError(f"Don't have the write permission on the directory {path}.")
|
|
@@ -31,7 +31,7 @@ from mindspore.communication.management import get_rank, get_group_size
|
|
|
31
31
|
from mindspore.parallel._tensor import _load_tensor, _reshape_param_data, _reshape_param_data_with_weight, \
|
|
32
32
|
_get_tensor_slice_index, _get_tensor_strategy
|
|
33
33
|
from mindspore.parallel._utils import _is_in_auto_parallel_mode, _get_pipeline_stages, _infer_rank_list, \
|
|
34
|
-
_remove_repeated_slices, _get_auto_parallel_net
|
|
34
|
+
_remove_repeated_slices, _get_auto_parallel_net, _check_path_safe, _check_path_writable
|
|
35
35
|
from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
|
|
36
36
|
_transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, _build_searched_strategy, \
|
|
37
37
|
_extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
|
|
@@ -69,7 +69,9 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
|
|
|
69
69
|
>>> ms.parallel.merge_pipeline_strategys("./src_strategy_dir", "./dst_strategy.ckpt")
|
|
70
70
|
|
|
71
71
|
"""
|
|
72
|
-
|
|
72
|
+
dst_strategy_file = os.path.normpath(dst_strategy_file)
|
|
73
|
+
dst_strategy_file = os.path.abspath(dst_strategy_file)
|
|
74
|
+
dst_strategy_dir = os.path.dirname(dst_strategy_file)
|
|
73
75
|
if not os.path.exists(dst_strategy_dir):
|
|
74
76
|
_make_dir(dst_strategy_dir, "path")
|
|
75
77
|
if not os.path.isdir(src_strategy_dirs):
|
|
@@ -495,6 +497,9 @@ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckp
|
|
|
495
497
|
def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
|
|
496
498
|
dst_strategy_file=None):
|
|
497
499
|
"""Transform checkpoints for all stages in src_strategy_file"""
|
|
500
|
+
_check_path_safe(dst_checkpoints_dir, "dst_checkpoints_dir")
|
|
501
|
+
dst_checkpoints_dir = os.path.realpath(dst_checkpoints_dir)
|
|
502
|
+
_check_path_safe(ckpt_prefix, "ckpt_prefix")
|
|
498
503
|
checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
|
|
499
504
|
all_checkpoint_files_map = {}
|
|
500
505
|
for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
|
|
@@ -563,6 +568,7 @@ def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix
|
|
|
563
568
|
save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(transform_rank))
|
|
564
569
|
if not os.path.exists(save_checkpoint_file_dir):
|
|
565
570
|
_make_dir(save_checkpoint_file_dir, "path")
|
|
571
|
+
_check_path_writable(save_checkpoint_file_dir)
|
|
566
572
|
save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
|
|
567
573
|
ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
|
|
568
574
|
del param_total_dict_copy
|
|
@@ -913,6 +919,15 @@ def set_op_strategy_config(mode="SAVE", path=""):
|
|
|
913
919
|
if file_type != ".json":
|
|
914
920
|
raise KeyError("File type must be .json")
|
|
915
921
|
dir_path = os.path.dirname(path)
|
|
922
|
+
|
|
923
|
+
normalized_path = os.path.abspath(os.path.realpath(path))
|
|
924
|
+
dangerous_paths = ['/etc', '/usr', '/bin', '/sbin', '/boot', '/proc', '/sys']
|
|
925
|
+
for dangerous_path in dangerous_paths:
|
|
926
|
+
if normalized_path.startswith(dangerous_path):
|
|
927
|
+
raise PermissionError(
|
|
928
|
+
f"Writing to system directory '{dangerous_path}' is not allowed"
|
|
929
|
+
)
|
|
930
|
+
|
|
916
931
|
if dir_path and not os.path.exists(dir_path):
|
|
917
932
|
os.makedirs(dir_path, mode=0o700, exist_ok=True)
|
|
918
933
|
check_mode_type = ["SAVE", "LOAD"]
|
|
@@ -1182,6 +1197,7 @@ def load_distributed_checkpoint(network, checkpoint_filenames=None, predict_stra
|
|
|
1182
1197
|
|
|
1183
1198
|
param_total_dict = defaultdict(dict)
|
|
1184
1199
|
for file_index, file_name in enumerate(checkpoint_filenames):
|
|
1200
|
+
file_name = os.path.abspath(file_name)
|
|
1185
1201
|
ckpt_dict = ms.load_checkpoint(file_name, dec_key=dec_key, dec_mode=dec_mode)
|
|
1186
1202
|
for param_name, param in ckpt_dict.items():
|
|
1187
1203
|
param_total_dict[param_name][file_index] = param
|
|
@@ -21,6 +21,7 @@ import subprocess
|
|
|
21
21
|
import socket
|
|
22
22
|
import psutil
|
|
23
23
|
import mindspore.log as logger
|
|
24
|
+
from mindspore.utils import RSCPluginHandle
|
|
24
25
|
from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url, \
|
|
25
26
|
_is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip, _generate_auto_bind_core_strategy, \
|
|
26
27
|
_generate_bind_core_strategy
|
|
@@ -221,23 +222,28 @@ class _ProcessManager:
|
|
|
221
222
|
|
|
222
223
|
self.proc_rank_map = {}
|
|
223
224
|
self.enable_mindx = False
|
|
225
|
+
self.handler = None
|
|
224
226
|
self._check_taskd()
|
|
225
227
|
|
|
226
228
|
def _check_taskd(self):
|
|
227
229
|
"""check if enable taskd."""
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
230
|
+
self.handler = RSCPluginHandle()
|
|
231
|
+
self.enable_mindx = self.handler.check_enable()
|
|
232
|
+
if self.enable_mindx is False:
|
|
233
|
+
self.handler = None
|
|
234
|
+
return
|
|
235
|
+
ret = self.handler.register_callback({"KILL_WORKER": self.kill_workers,
|
|
236
|
+
"START_ALL_WORKER": self.start_all_workers,
|
|
237
|
+
"START_WORKER_LIST": self.start_worker_list,
|
|
238
|
+
"MONITOR": self.monitor_rank_status
|
|
239
|
+
})
|
|
240
|
+
if not ret:
|
|
241
|
+
logger.warning(f"Register callback to mindx failed, process controlled by msrun.")
|
|
242
|
+
self.enable_mindx = False
|
|
243
|
+
self.handler = None
|
|
244
|
+
return
|
|
245
|
+
logger.warning(f"Mindx enabled, process controlled by mindx.")
|
|
246
|
+
os.environ["MS_ENABLE_RECOVERY"] = str(1)
|
|
241
247
|
|
|
242
248
|
def run(self):
|
|
243
249
|
"""
|
|
@@ -260,7 +266,7 @@ class _ProcessManager:
|
|
|
260
266
|
if self.is_master and not self.is_simulation:
|
|
261
267
|
self.start_scheduler()
|
|
262
268
|
if self.enable_mindx:
|
|
263
|
-
self.
|
|
269
|
+
self.handler.start()
|
|
264
270
|
else:
|
|
265
271
|
self.start_workers()
|
|
266
272
|
if self.join:
|
|
@@ -382,8 +388,7 @@ class _ProcessManager:
|
|
|
382
388
|
logger.error(f"Scheduler process {self.msn_process.pid} exit with exception.")
|
|
383
389
|
|
|
384
390
|
if has_exception:
|
|
385
|
-
|
|
386
|
-
self._analyze_log()
|
|
391
|
+
self._analyze_sched_log()
|
|
387
392
|
raise RuntimeError("Distributed job exited with exception. Please check logs in "
|
|
388
393
|
f"directory: {self.log_dir}.")
|
|
389
394
|
|
|
@@ -583,26 +588,13 @@ class _ProcessManager:
|
|
|
583
588
|
log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
|
|
584
589
|
return node_id, log_name
|
|
585
590
|
|
|
586
|
-
def _analyze_log(self):
|
|
591
|
+
def _analyze_sched_log(self):
|
|
587
592
|
"""
|
|
588
|
-
Analyze
|
|
593
|
+
Analyze scheduler log.
|
|
589
594
|
"""
|
|
590
595
|
scheduler_log_path = os.path.join(self.log_dir, "scheduler.log")
|
|
591
|
-
time_out_node_ids = []
|
|
592
596
|
if os.path.exists(scheduler_log_path):
|
|
593
|
-
|
|
594
|
-
scheduler_log = log.read()
|
|
595
|
-
# Filter out abnormal logs.
|
|
596
|
-
time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)
|
|
597
|
-
|
|
598
|
-
# Filter out node ids of the processes which exit abnormally.
|
|
599
|
-
def node_id_splitter(node_id):
|
|
600
|
-
return re.split(" is timed out", re.split("node: ", node_id)[1])[0]
|
|
601
|
-
for node_id in time_out_node_log:
|
|
602
|
-
time_out_node_ids.append(node_id_splitter(node_id))
|
|
603
|
-
logger.error(f"Time out nodes are {time_out_node_ids}")
|
|
604
|
-
|
|
605
|
-
os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
|
|
597
|
+
os.system(f"cat {scheduler_log_path} | grep -E 'ERROR|CRITICAL|Traceback|Error' -C 5")
|
|
606
598
|
|
|
607
599
|
def format_worker_log_name(self):
|
|
608
600
|
"""
|
|
@@ -30,7 +30,7 @@ def _generate_cmd(cmd, cmd_args, output_name):
|
|
|
30
30
|
|
|
31
31
|
"""
|
|
32
32
|
if cmd not in ['python', 'pytest', 'python3']:
|
|
33
|
-
# If user don't set binary file name,
|
|
33
|
+
# If user don't set binary file name, defaultly use 'python' to launch the job.
|
|
34
34
|
command = f"python {cmd} {' '.join(cmd_args)} > {output_name} 2>&1 &"
|
|
35
35
|
else:
|
|
36
36
|
command = f"{cmd} {' '.join(cmd_args)} > {output_name} 2>&1 &"
|
|
@@ -42,7 +42,7 @@ def _generate_cmd_args_list(cmd, cmd_args):
|
|
|
42
42
|
Generates arguments list for 'Popen'. It consists of a binary file name and subsequential arguments.
|
|
43
43
|
"""
|
|
44
44
|
if cmd not in ['python', 'pytest', 'python3']:
|
|
45
|
-
# If user don't set binary file name,
|
|
45
|
+
# If user don't set binary file name, defaultly use 'python' to launch the job.
|
|
46
46
|
return ['python'] + [cmd] + cmd_args
|
|
47
47
|
return [cmd] + cmd_args
|
|
48
48
|
|
|
@@ -55,7 +55,7 @@ def _generate_cmd_args_list_with_core(cmd, cmd_args, affinity_cpu_str):
|
|
|
55
55
|
taskset_args = ['taskset'] + ['-c'] + [affinity_cpu_str]
|
|
56
56
|
final_cmd = []
|
|
57
57
|
if cmd not in ['python', 'pytest', 'python3']:
|
|
58
|
-
# If user don't set binary file name,
|
|
58
|
+
# If user don't set binary file name, defaultly use 'python' to launch the job.
|
|
59
59
|
final_cmd = taskset_args + ['python'] + [cmd] + cmd_args
|
|
60
60
|
else:
|
|
61
61
|
final_cmd = taskset_args + [cmd] + cmd_args
|
|
@@ -143,8 +143,14 @@ def _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, device_to
|
|
|
143
143
|
Parse the global device_to_cpu_map and return a cpu list for assigned local_rank_id.
|
|
144
144
|
|
|
145
145
|
"""
|
|
146
|
+
if local_rank_id >= len(list(device_to_cpu_map.keys())):
|
|
147
|
+
logger.warning(f"Cannot find process[{local_rank_id}] in args '--bind_core'. "
|
|
148
|
+
"Will not launch process with taskset.")
|
|
149
|
+
return ""
|
|
146
150
|
input_device_id = int(list(device_to_cpu_map.keys())[local_rank_id].replace("device", ""))
|
|
147
151
|
if physical_device_id != input_device_id:
|
|
152
|
+
logger.warning(f"Cannot find physical_device_id[{physical_device_id}] for process[{local_rank_id}] "
|
|
153
|
+
"in args '--bind_core'. Will not launch process with taskset.")
|
|
148
154
|
return ""
|
|
149
155
|
affinity_cpu_list = list(device_to_cpu_map.values())[local_rank_id]
|
|
150
156
|
affinity_cpu_str = ",".join(affinity_cpu_list)
|
|
@@ -212,8 +218,6 @@ def _generate_bind_core_strategy(local_rank_id, device_to_cpu_map, arg_bind_core
|
|
|
212
218
|
if isinstance(arg_bind_core, dict):
|
|
213
219
|
affinity_cpu_str = _parse_global_device_to_cpu_map(local_rank_id, physical_device_id, arg_bind_core)
|
|
214
220
|
if not affinity_cpu_str:
|
|
215
|
-
logger.warning(f"Failed to find physical_device_id[{physical_device_id}] for "
|
|
216
|
-
f"process[{local_rank_id}]. Will not launch process with taskset.")
|
|
217
221
|
return None
|
|
218
222
|
elif arg_bind_core is True:
|
|
219
223
|
cpu_list_for_device = device_to_cpu_map.get(physical_device_id, [])
|
|
@@ -1,23 +1,21 @@
|
|
|
1
|
-
# Copyright
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
# ============================================================================
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
__all__ = ['LlamaBoost', 'QwenBoost']
|
|
1
|
+
# Copyright 2025 Huawei Technologies Co., Ltd
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ============================================================================
|
|
15
|
+
|
|
16
|
+
"""distributed init"""
|
|
17
|
+
from mindspore.parallel.distributed.distributed_data_parallel import DistributedDataParallel
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"DistributedDataParallel",
|
|
21
|
+
]
|