mindspore 2.3.0__cp39-none-any.whl → 2.3.0rc2__cp39-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/Third_Party_Open_Source_Software_Notice +0 -1512
- mindspore/__init__.py +1 -2
- mindspore/_c_dataengine.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/_c_mindrecord.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +25 -5
- mindspore/_extends/graph_kernel/model/graph_parallel.py +1 -1
- mindspore/_extends/parse/__init__.py +2 -2
- mindspore/_extends/parse/compile_config.py +0 -29
- mindspore/_extends/parse/namespace.py +2 -2
- mindspore/_extends/parse/parser.py +5 -21
- mindspore/_extends/parse/resources.py +7 -5
- mindspore/_extends/parse/standard_method.py +59 -40
- mindspore/_mindspore_offline_debug.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/amp.py +5 -26
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/boost/adasum.py +1 -1
- mindspore/boost/base.py +1 -1
- mindspore/boost/boost_cell_wrapper.py +1 -1
- mindspore/boost/grad_freeze.py +2 -2
- mindspore/boost/less_batch_normalization.py +6 -9
- mindspore/common/__init__.py +1 -8
- mindspore/common/_register_for_tensor.py +9 -8
- mindspore/common/api.py +65 -275
- mindspore/common/dtype.py +4 -8
- mindspore/common/dump.py +5 -2
- mindspore/common/jit_config.py +1 -1
- mindspore/common/lazy_inline.py +2 -14
- mindspore/common/parameter.py +15 -14
- mindspore/common/recompute.py +5 -20
- mindspore/common/sparse_tensor.py +6 -21
- mindspore/common/tensor.py +52 -100
- mindspore/communication/__init__.py +11 -6
- mindspore/communication/management.py +94 -92
- mindspore/context.py +18 -180
- mindspore/dataset/engine/datasets.py +46 -69
- mindspore/dataset/engine/datasets_user_defined.py +53 -72
- mindspore/dataset/engine/datasets_vision.py +2 -2
- mindspore/dataset/engine/queue.py +38 -56
- mindspore/dataset/engine/validators.py +5 -11
- mindspore/dataset/vision/__init__.py +5 -5
- mindspore/dataset/vision/c_transforms.py +5 -5
- mindspore/dataset/vision/py_transforms_util.py +1 -1
- mindspore/dataset/vision/transforms.py +46 -591
- mindspore/dataset/vision/utils.py +1 -121
- mindspore/dataset/vision/validators.py +3 -9
- mindspore/hal/__init__.py +1 -7
- mindspore/hal/device.py +1 -1
- mindspore/include/api/model.h +0 -3
- mindspore/include/dataset/vision.h +2 -54
- mindspore/include/mindapi/base/types.h +0 -1
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore.so +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_shared_lib.so +0 -0
- mindspore/lib/libmpi_adapter.so +0 -0
- mindspore/lib/libmpi_collective.so +0 -0
- mindspore/lib/libnnacl.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libps_cache.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +0 -35
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +0 -2
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/vector_core/tbe/custom_aicore_ops_impl/kv_cache_mgr.py +0 -2
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/config/cust_aicpu_kernel.json +0 -72
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/include/{aclnn_all_finite.h → aclnn_add_custom.h} +11 -9
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/include/aclnn_decoder_kv_cache.h +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/include/aclnn_prompt_kv_cache.h +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +12 -184
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +15 -7
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +15 -7
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/add_custom.cpp +81 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/add_custom.py +134 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/decoder_kv_cache.py +31 -77
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/prompt_kv_cache.py +31 -77
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_proto/inc/op_proto.h +5 -4
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/libascend_collective.so +0 -0
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/liblowlatency_collective.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/DeviceBin +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/PkgInspect +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/op_man +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/device/ascend910b/bin/ascend910b.bin +286 -275
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_cann_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/add_impl.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/apply_rotary_pos_emb_impl.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/asdop/asd_op_impl.h +0 -3
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/backend_param.h +0 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/cast/cast_tiling.h +45 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/compare_impl.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/flash_attention_score/flash_attention_score_impl.h +4 -8
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/flash_attention_score/flash_attention_score_tiling.h +4 -11
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/flash_attention_score/kernel/flash_attention_score_mix_hwsync.h +0 -18
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_kernel.h +0 -6
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_rtbackend.h +75 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul/kernel/matmul.h +5 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul/matmul_impl.h +3 -18
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_common/pp_matmul_common_tiling.h +5 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_common/pp_matmul_info.h +2 -2
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_common/tiling_data.h +3 -36
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_stridedslice/kernel/matmul_stridedslice_fusion.h +2 -2
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/matmul_stridedslice/matmul_stridedslice_fusion_impl.h +4 -22
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_param.h +2 -16
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/kernel/paged_attention_mix_hwsync.h +3 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_impl.h +4 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_tiling.h +4 -9
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/attention_param.h +2 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_ext_param.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_qkv_param.h +4 -10
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/sub_param.h +12 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/rms_norm_impl.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_impl.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/matmul_table.h +1 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/backend.h +2 -10
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/elewise_utils.h +1 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log.h +0 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_tiling.h +0 -17
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/math.h +7 -2
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libAdd_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libSub_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layernorm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcast_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libgelu_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_stridedslice_fusion_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libnot_equal_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/librms_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/BSAttention/paged_attention_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblcal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblccl_wrapper.so +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/mindrecord/filewriter.py +2 -2
- mindspore/mint/__init__.py +40 -720
- mindspore/mint/nn/__init__.py +7 -89
- mindspore/mint/nn/functional.py +16 -165
- mindspore/mint/optim/adamw.py +16 -15
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +98 -97
- mindspore/nn/extend/basic.py +2 -2
- mindspore/nn/extend/embedding.py +1 -1
- mindspore/nn/extend/layer/normalization.py +5 -7
- mindspore/nn/generator.py +297 -0
- mindspore/nn/layer/activation.py +3 -4
- mindspore/nn/layer/basic.py +16 -79
- mindspore/nn/layer/conv.py +8 -17
- mindspore/nn/layer/embedding.py +4 -1
- mindspore/nn/layer/math.py +1 -1
- mindspore/nn/layer/normalization.py +1 -1
- mindspore/nn/layer/pooling.py +0 -5
- mindspore/nn/layer/rnn_cells.py +2 -2
- mindspore/nn/loss/loss.py +19 -19
- mindspore/nn/optim/adasum.py +1 -1
- mindspore/nn/optim/sgd.py +2 -3
- mindspore/nn/probability/distribution/exponential.py +1 -1
- mindspore/nn/probability/distribution/geometric.py +1 -1
- mindspore/nn/probability/distribution/logistic.py +1 -1
- mindspore/nn/wrap/cell_wrapper.py +1 -25
- mindspore/nn/wrap/loss_scale.py +1 -24
- mindspore/numpy/array_ops.py +1 -5
- mindspore/numpy/dtypes.py +3 -3
- mindspore/numpy/math_ops.py +8 -8
- mindspore/ops/__init__.py +1 -1
- mindspore/ops/_grad_experimental/grad_comm_ops.py +16 -75
- mindspore/ops/_vmap/vmap_array_ops.py +0 -27
- mindspore/ops/_vmap/vmap_math_ops.py +1 -29
- mindspore/ops/_vmap/vmap_nn_ops.py +18 -19
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +8 -34
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +9 -2
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -26
- mindspore/ops/auto_generate/gen_extend_func.py +27 -603
- mindspore/ops/auto_generate/gen_ops_def.py +203 -993
- mindspore/ops/auto_generate/gen_ops_prim.py +402 -1946
- mindspore/ops/auto_generate/pyboost_inner_prim.py +20 -90
- mindspore/ops/composite/base.py +6 -3
- mindspore/ops/composite/math_ops.py +1 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +17 -24
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/extend/__init__.py +3 -2
- mindspore/ops/extend/array_func.py +51 -10
- mindspore/ops/extend/nn_func.py +78 -2
- mindspore/ops/function/__init__.py +13 -8
- mindspore/ops/function/array_func.py +179 -455
- mindspore/ops/function/clip_func.py +1 -1
- mindspore/ops/function/grad/grad_func.py +3 -3
- mindspore/ops/function/math_func.py +103 -117
- mindspore/ops/function/nn_func.py +163 -275
- mindspore/ops/function/other_func.py +2 -2
- mindspore/ops/function/random_func.py +69 -202
- mindspore/ops/function/sparse_func.py +4 -4
- mindspore/ops/functional.py +327 -332
- mindspore/ops/operations/__init__.py +3 -13
- mindspore/ops/operations/_grad_ops.py +27 -3
- mindspore/ops/operations/_inner_ops.py +356 -53
- mindspore/ops/operations/_rl_inner_ops.py +2 -2
- mindspore/ops/operations/_tensor_array.py +8 -8
- mindspore/ops/operations/array_ops.py +65 -82
- mindspore/ops/operations/comm_ops.py +93 -784
- mindspore/ops/operations/custom_ops.py +28 -51
- mindspore/ops/operations/debug_ops.py +4 -4
- mindspore/ops/operations/inner_ops.py +2 -2
- mindspore/ops/operations/manually_defined/ops_def.py +4 -304
- mindspore/ops/operations/math_ops.py +50 -3
- mindspore/ops/operations/nn_ops.py +247 -14
- mindspore/ops/operations/other_ops.py +3 -3
- mindspore/ops/operations/random_ops.py +1 -1
- mindspore/ops/operations/sparse_ops.py +1 -1
- mindspore/ops/primitive.py +8 -9
- mindspore/ops/silent_check.py +5 -5
- mindspore/ops_generate/arg_dtype_cast.py +9 -2
- mindspore/ops_generate/arg_handler.py +0 -26
- mindspore/ops_generate/gen_aclnn_implement.py +4 -1
- mindspore/ops_generate/gen_ops.py +4 -26
- mindspore/ops_generate/gen_pyboost_func.py +12 -41
- mindspore/ops_generate/gen_utils.py +0 -21
- mindspore/ops_generate/pyboost_utils.py +2 -7
- mindspore/ops_generate/template.py +0 -1
- mindspore/parallel/_auto_parallel_context.py +1 -21
- mindspore/parallel/_tensor.py +5 -0
- mindspore/parallel/_transformer/transformer.py +1 -1
- mindspore/parallel/_utils.py +1 -15
- mindspore/parallel/algo_parameter_config.py +3 -1
- mindspore/parallel/checkpoint_transform.py +9 -12
- mindspore/parallel/cluster/process_entity/_api.py +29 -28
- mindspore/parallel/cluster/process_entity/_utils.py +3 -13
- mindspore/parallel/cluster/run.py +16 -13
- mindspore/parallel/parameter_broadcast.py +2 -2
- mindspore/parallel/shard.py +17 -31
- mindspore/profiler/__init__.py +2 -3
- mindspore/profiler/common/util.py +2 -107
- mindspore/profiler/envprofiling.py +1 -1
- mindspore/profiler/parser/ascend_analysis/constant.py +21 -8
- mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -82
- mindspore/profiler/parser/ascend_analysis/function_event.py +28 -43
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +27 -49
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +10 -15
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +20 -25
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +5 -5
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +1 -10
- mindspore/profiler/parser/ascend_hccl_generator.py +1 -4
- mindspore/profiler/parser/ascend_msprof_exporter.py +22 -43
- mindspore/profiler/parser/ascend_timeline_generator.py +5 -7
- mindspore/profiler/parser/minddata_parser.py +3 -72
- mindspore/profiler/profiling.py +59 -176
- mindspore/rewrite/api/node.py +1 -1
- mindspore/rewrite/common/namespace.py +5 -5
- mindspore/rewrite/parsers/assign_parser.py +0 -2
- mindspore/rewrite/parsers/class_def_parser.py +4 -8
- mindspore/run_check/_check_version.py +1 -1
- mindspore/scipy/fft.py +3 -1
- mindspore/scipy/linalg.py +3 -2
- mindspore/scipy/ops.py +3 -5
- mindspore/scipy/optimize/__init__.py +2 -2
- mindspore/train/__init__.py +4 -4
- mindspore/train/anf_ir_pb2.py +2 -8
- mindspore/train/callback/__init__.py +2 -5
- mindspore/train/callback/_backup_and_restore.py +2 -2
- mindspore/train/callback/_checkpoint.py +16 -104
- mindspore/train/callback/_landscape.py +1 -1
- mindspore/train/callback/_time_monitor.py +1 -1
- mindspore/train/data_sink.py +4 -5
- mindspore/train/dataset_helper.py +20 -45
- mindspore/train/model.py +38 -266
- mindspore/train/serialization.py +105 -256
- mindspore/train/summary/_summary_adapter.py +1 -1
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.3.0rc2.dist-info}/METADATA +2 -2
- {mindspore-2.3.0.dist-info → mindspore-2.3.0rc2.dist-info}/RECORD +303 -420
- mindspore/_extends/pijit/__init__.py +0 -23
- mindspore/_extends/pijit/pijit_func_white_list.py +0 -343
- mindspore/common/file_system.py +0 -48
- mindspore/common/generator.py +0 -260
- mindspore/common/no_inline.py +0 -54
- mindspore/common/np_dtype.py +0 -25
- mindspore/communication/comm_func.py +0 -1140
- mindspore/hal/memory.py +0 -326
- mindspore/lib/libavcodec.so.59 +0 -0
- mindspore/lib/libavdevice.so.59 +0 -0
- mindspore/lib/libavfilter.so.8 +0 -0
- mindspore/lib/libavformat.so.59 +0 -0
- mindspore/lib/libavutil.so.57 +0 -0
- mindspore/lib/libmindspore_np_dtype.so +0 -0
- mindspore/lib/libswresample.so.4 +0 -0
- mindspore/lib/libswscale.so.6 +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/all_finite.cpp +0 -326
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl/dynamic/all_finite.py +0 -180
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +0 -109
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +0 -38
- mindspore/lib/plugin/ascend/custom_compiler/OWNERS +0 -12
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +0 -255
- mindspore/lib/plugin/ascend/custom_compiler/start.sh +0 -26
- mindspore/lib/plugin/ascend/custom_compiler/template.json +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/base_type.h +0 -133
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_creator.h +0 -32
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_param.h +0 -35
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/tiling_info.h +0 -60
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/kernel_register.h +0 -37
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/platform_configs.h +0 -89
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/rt_funcs.h +0 -135
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_op.h +0 -34
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_backoff_base.h +0 -62
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_elewise_op.h +0 -33
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_ops.h +0 -88
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_pa_op.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/cast_op.h +0 -52
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/matmul_op.h +0 -95
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/asd_utils.h +0 -84
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/comm_utils.h +0 -61
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp32.h +0 -224
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/and_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/div_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_tiling.h +0 -25
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/and_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/div_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_base.h +0 -260
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_kernel.h +0 -35
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/max_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/min_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/mul_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/or_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/max_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/min_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/mul_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/or_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/abs_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_impl.h +0 -47
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_tiling.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/exp_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/abs_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_base.h +0 -148
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_kernel.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/exp_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/ln_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/not_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/reciprocal_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/relu_kernel.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/rsqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/sqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/ln_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/not_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/reciprocal_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/relu_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/rsqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/sqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/grouped_matmul_impl.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/grouped_matmul_tiling.h +0 -187
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/kernel/grouped_matmul.h +0 -245
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/kernel/grouped_matmul_interface.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/kernel/grouped_matmul_utils.h +0 -111
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/grouped_matmul/tiling_data.h +0 -54
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/compare_param.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/elewise_param.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/grouped_matmul_param.h +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/profiling_util.h +0 -364
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_utils.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_creator.h +0 -39
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_registry.h +0 -114
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/utils.h +0 -98
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MatMulPostFusionMixTactic/matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MatMulPostFusionMixTactic/matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MatMulPostFusionMixTactic/matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MatMulPostFusionMixTactic/matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MultiMatMulPostFusionMixTactic/multi_matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MultiMatMulPostFusionMixTactic/multi_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MultiMatMulPostFusionMixTactic/multi_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/MultiMatMulPostFusionMixTactic/multi_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/mint/linalg/__init__.py +0 -22
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/nn/layer/embedding_service_layer.py +0 -393
- mindspore/ops/function/reshard_func.py +0 -102
- mindspore/ops/operations/_infer_ops.py +0 -19
- mindspore/ops/operations/reshard_ops.py +0 -53
- mindspore/profiler/common/process_pool.py +0 -41
- mindspore/profiler/common/singleton.py +0 -28
- mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
- mindspore/profiler/parser/ascend_memory_generator.py +0 -185
- mindspore/train/callback/_cluster_monitor.py +0 -201
- mindspore/train/callback/_flops_collector.py +0 -238
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.3.0rc2.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.3.0rc2.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.3.0rc2.dist-info}/top_level.txt +0 -0
@@ -158,7 +158,7 @@ def generate_pyboost_op_source_code(work_path, op_proto, template_paths, convert
     operator_name = converter.functional_name
     call_args_tensor = []
     for type, arg_name in zip(converter.call_args_types, converter.call_args):
-        if type in ("BaseTensorPtr", "std::optional<BaseTensorPtr>"):
+        if type in ("TensorPtr", "std::optional<TensorPtr>"):
             call_args_tensor.append(arg_name)

     for call_tpl, src_tpl, view_tpl, cus_tpl, gen_path in zip(template_paths.op_call_template_path,
@@ -181,7 +181,6 @@ def generate_pyboost_op_source_code(work_path, op_proto, template_paths, convert
     get_cube_math_type = ''
     real_output = ', ' + converter.op_outputs
     proto_operator_name = op_proto.operator_name
-    register_custom_kernel = ''
     if is_ascend and op_proto.ascend != 'default':
         call_impl = cus_tpl.replace(call_args=converter.call_args,
                                     return_values=converter.call_func_outputs,
@@ -196,7 +195,6 @@ def generate_pyboost_op_source_code(work_path, op_proto, template_paths, convert
         )
         customize_include = "#include \"plugin/device/cpu/kernel/pyboost/customize/{}.h\"".format(
             operator_name.lower())
-        register_custom_kernel = "MS_REG_PYBOOST_CPU_CUSTOM_KERNEL({});".format(op_name_str)
     elif is_gpu and op_proto.gpu != 'default':
         call_impl = cus_tpl.replace(call_args=converter.call_args,
                                     return_values=converter.call_func_outputs,
@@ -204,7 +202,6 @@ def generate_pyboost_op_source_code(work_path, op_proto, template_paths, convert
         )
         customize_include = "#include \"plugin/device/gpu/kernel/pyboost/customize/{}.h\"".format(
             operator_name.lower())
-        register_custom_kernel = "MS_REG_PYBOOST_GPU_CUSTOM_KERNEL({});".format(op_name_str)
     elif op_proto.is_view:
         set_output_abs = "SetOutputAbstract();"
         if converter.call_func_outputs == "outputs_":
@@ -254,8 +251,7 @@ def generate_pyboost_op_source_code(work_path, op_proto, template_paths, convert
                                call_args_with_type=converter.call_args_with_types,
                                return_type=converter.cpp_func_return,
                                customize_include=customize_include,
-                               call_impl=call_impl
-                               register_custom_kernel=register_custom_kernel)
+                               call_impl=call_impl)
     op_header_dir_path = os.path.join(work_path, gen_path)
     tmp_op_source_file_path = os.path.join(op_header_dir_path, "tmp_" + operator_name.lower() + ".cc")
     dst_op_source_file_path = os.path.join(op_header_dir_path, operator_name.lower() + ".cc")
@@ -300,16 +296,8 @@ def generate_pyboost_op_return_code(op_proto):
 def generate_pyboost_op_func_return_type(op_proto):
     """ generate_pyboost_op_func_return_type """
     returns_type = []
-    type_convert_to_base = {
-        'std::vector<tensor::TensorPtr>': 'std::vector<tensor::BaseTensorPtr>',
-        'tensor::TensorPtr': 'tensor::BaseTensorPtr'
-    }
     for return_obj in op_proto.returns:
-
-        if temp_return in type_convert_to_base:
-            returns_type.append(type_convert_to_base[temp_return])
-        else:
-            raise Exception("Not return found")
+        returns_type.append(get_return_type(return_obj.arg_dtype))
     if len(returns_type) == 1:
         cpp_func_return = returns_type[0]
     elif len(returns_type) > 1:
@@ -429,7 +417,6 @@ def generate_pyboost_functions(work_path, yaml_data):
     convert_stub_str = ''
     optional_to_value_str = ''
     need_contiguous = 'true'
-    value_str = '_value'
     if op_proto.is_view:
         # view/aclnn op no need to contiguous tensor.
         need_contiguous = 'false'
@@ -445,7 +432,7 @@ def generate_pyboost_functions(work_path, yaml_data):
                                                     need_contiguous=need_contiguous)
         cast_output = cast_str + convert_stub_output_name

-        convert_optional_to_value_name = op_arg.arg_name + value_str
+        convert_optional_to_value_name = op_arg.arg_name + "_value"
         optional_to_value_str += \
             convert_optional_to_value_template.replace(input=cast_output,
                                                        output=convert_optional_to_value_name)
@@ -461,35 +448,19 @@ def generate_pyboost_functions(work_path, yaml_data):
             grad_arg = cast_str + convert_stub_output_name
             cast_arg = grad_arg
         elif pyboost_utils.is_tensor_list(op_arg):
-
-
-
-
-
-
-
-
-            convert_optional_to_value_name = op_arg.arg_name + value_str
-            optional_to_value_str += \
-                convert_optional_to_value_template.replace(input=cast_output,
-                                                           output=convert_optional_to_value_name)
-            call_arg = convert_stub_output_name
-            grad_arg = convert_optional_to_value_name
-            cast_arg = cast_output
-        else:
-            convert_stub_output_name = op_arg.arg_name + "_tensor_list"
-            convert_stub_str += convert_to_tensor_list_template.replace(input=op_arg.arg_name,
-                                                                        output=convert_stub_output_name,
-                                                                        need_contiguous=need_contiguous)
-            call_arg = convert_stub_output_name
-            grad_arg = cast_str + convert_stub_output_name
-            cast_arg = grad_arg
+            convert_stub_output_name = op_arg.arg_name + "_tensor_list"
+            convert_stub_str += convert_to_tensor_list_template.replace(input=op_arg.arg_name,
+                                                                        output=convert_stub_output_name,
+                                                                        need_contiguous=need_contiguous)
+            call_arg = convert_stub_output_name
+            grad_arg = cast_str + convert_stub_output_name
+            cast_arg = grad_arg
         else:
             call_arg = op_arg.arg_name
             grad_arg = cast_str + op_arg.arg_name
             cast_arg = grad_arg
             if is_optional_param(op_arg):
-                convert_optional_to_value_name = op_arg.arg_name + value_str
+                convert_optional_to_value_name = op_arg.arg_name + "_value"
                 optional_to_value_str += \
                     convert_optional_to_value_template.replace(input=call_arg,
                                                                output=convert_optional_to_value_name)
@@ -76,12 +76,10 @@ def get_type_str(type_str):
     'tuple[float]',
     'tuple[bool]',
     'tuple[tensor]',
-    'tuple[str]',
     'list[int]',
     'list[float]',
     'list[bool]',
     'list[tensor]',
-    'list[str]',
     'tensor',
     'type',
 }
@@ -147,25 +145,6 @@ def merge_files(origin_dir, merged_file_path, file_format):
     merge_files_to_one_file(op_yaml_file_names, merged_file_path)


-def merge_files_append(origin_dir, merged_file_path, file_format):
-    """
-    Merge multiple files into one file.
-    origin_dir: indicates the origin file directory.
-    merged_file_path: indicates the merged file path.
-    file_format: indicates the format of regular matching.
-    Files whose names meet the regular matching in 'origin_dir' directory will be merged into one file.
-    """
-    file_paths = glob.glob(os.path.join(origin_dir, file_format))
-    merged_content = ''
-    file_paths.sort()
-    for file_path in file_paths:
-        with open(file_path, 'r') as file:
-            merged_content += file.read()
-            merged_content += '\n'
-    with open(merged_file_path, 'a') as file:
-        file.write(merged_content)
-
-
 def safe_load_yaml(yaml_file_path):
     """
     Load yaml dictionary from file.
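The removed merge_files_append helper is a plain glob-sort-concatenate routine. For readers who want the technique in isolation, here is a self-contained sketch with imports included (merge_matching_files is an illustrative name, not a package API):

import glob
import os

def merge_matching_files(origin_dir, merged_file_path, pattern):
    """Append the contents of every file in origin_dir matching pattern
    (e.g. '*_op.yaml') to merged_file_path, in sorted order."""
    file_paths = sorted(glob.glob(os.path.join(origin_dir, pattern)))
    merged_content = ''
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            merged_content += f.read()
        merged_content += '\n'  # keep a separator between source files
    with open(merged_file_path, 'a') as f:  # 'a': append rather than overwrite
        f.write(merged_content)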
@@ -85,11 +85,9 @@ def get_convert_type_str(dtype: str, optional):
     'tuple[int]': 'ToIntListOptional<py::tuple>',
     'tuple[float]': 'ToFloatListOptional<py::tuple>',
     'tuple[bool]': 'ToBoolListOptional<py::tuple>',
-    'tuple[tensor]': 'ToTensorListOptional<py::tuple>',
     'list[int]': 'ToIntListOptional<py::list>',
     'list[float]': 'ToFloatListOptional<py::list>',
     'list[bool]': 'ToBoolListOptional<py::list>',
-    'list[tensor]': 'ToTensorListOptional<py::list>',
 }
 if optional:
     if dtype in optional_type_convert:
@@ -181,7 +179,7 @@ def get_input_dtype(dtype: str, optional):
     'bool': 'BoolImmPtr',
     'number': 'ScalarPtr',
     'str': 'StringImmPtr',
-    'tensor': 'BaseTensorPtr',
+    'tensor': 'TensorPtr',
     'tuple[int]': value_tuple,
     'tuple[float]': value_tuple,
     'tuple[bool]': value_tuple,
@@ -198,11 +196,10 @@ def get_input_dtype(dtype: str, optional):
     'bool': 'std::optional<BoolImmPtr>',
     'number': 'std::optional<ScalarPtr>',
     'str': 'std::optional<StringImmPtr>',
-    'tensor': 'std::optional<BaseTensorPtr>',
+    'tensor': 'std::optional<TensorPtr>',
     'tuple[int]': value_tuple_optional,
     'tuple[float]': value_tuple_optional,
     'tuple[bool]': value_tuple_optional,
-    'tuple[tensor]': value_tuple_optional,
 }
 if optional:
     if dtype in optional_type_convert:
@@ -284,8 +281,6 @@ def get_tuple_input_convert(arg_name, arg_type):
     :return:
     """
     cpp_type = tuple_input_to_cpp_type(arg_type)
-    if cpp_type == "TensorPtr":
-        cpp_type = "BaseTensorPtr"
     return f"std::vector<{cpp_type}> {arg_name}_vector = ConvertValueTupleToVector<{cpp_type}>({arg_name});\n"

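These generator utilities emit C++ statements as Python f-strings. A minimal standalone sketch of the pattern behind get_tuple_input_convert, with the conversion target passed in directly (the argument and type names in the example are invented for illustration):

def emit_tuple_conversion(arg_name, cpp_type):
    # One line of generated C++: convert a ValueTuple argument to a std::vector.
    return (f"std::vector<{cpp_type}> {arg_name}_vector = "
            f"ConvertValueTupleToVector<{cpp_type}>({arg_name});\n")

print(emit_tuple_conversion("axes", "Int64ImmPtr"))
# std::vector<Int64ImmPtr> axes_vector = ConvertValueTupleToVector<Int64ImmPtr>(axes);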
@@ -242,19 +242,6 @@ class _AutoParallelContext:
         self.check_context_handle()
         return self._context_handle.get_pipeline_stage_split_num()

-    def set_auto_pipeline(self, auto_pipeline):
-        """Set the pipeline stage number to automatic"""
-        if not isinstance(auto_pipeline, bool):
-            raise TypeError("For 'set_auto_parallel_context', the argument 'auto_pipeline' "
-                            "must be bool, but got the type : {}.".format(type(auto_pipeline)))
-        self.check_context_handle()
-        self._context_handle.set_auto_pipeline(auto_pipeline)
-
-    def get_auto_pipeline(self):
-        """Get whether the pipeline stage number is automatic"""
-        self.check_context_handle()
-        return self._context_handle.get_auto_pipeline()
-
     def set_pipeline_result_broadcast(self, pipeline_result_broadcast):
         """
         Set the value of enabling pipeline result broadcast. Default: ``False``.
@@ -584,7 +571,7 @@ class _AutoParallelContext:
         self.check_context_handle()
         dir_path = os.path.dirname(strategy_ckpt_save_file)
         if dir_path and not os.path.exists(dir_path):
-            os.makedirs(dir_path, exist_ok=True)
+            os.makedirs(dir_path)
         self._context_handle.set_strategy_ckpt_save_file(strategy_ckpt_save_file)

     def get_strategy_ckpt_save_file(self):
@@ -1242,7 +1229,6 @@ _set_auto_parallel_context_func_map = {
    "gradient_fp32_sync": auto_parallel_context().set_gradient_fp32_sync,
    "loss_repeated_mean": auto_parallel_context().set_loss_repeated_mean,
    "pipeline_stages": auto_parallel_context().set_pipeline_stages,
-   "auto_pipeline": auto_parallel_context().set_auto_pipeline,
    "pipeline_result_broadcast": auto_parallel_context().set_pipeline_result_broadcast,
    "pipeline_segments": auto_parallel_context().set_pipeline_segments,
    "parallel_mode": auto_parallel_context().set_parallel_mode,
@@ -1275,7 +1261,6 @@ _get_auto_parallel_context_func_map = {
    "gradient_fp32_sync": auto_parallel_context().get_gradient_fp32_sync,
    "loss_repeated_mean": auto_parallel_context().get_loss_repeated_mean,
    "pipeline_stages": auto_parallel_context().get_pipeline_stages,
-   "auto_pipeline": auto_parallel_context().get_auto_pipeline,
    "pipeline_result_broadcast": auto_parallel_context().get_pipeline_result_broadcast,
    "pipeline_interleave": auto_parallel_context().get_pipeline_interleave,
    "pipeline_scheduler": auto_parallel_context().get_pipeline_scheduler,
@@ -1364,9 +1349,6 @@ def _set_auto_parallel_context(**kwargs):
             the devices are distributed alone the pipeline. The total devices will be divided into
             'pipeline_stags' stages. This currently could only be used when
             parallel mode semi_auto_parallel is enabled. Default: 0
-        auto_pipeline (bool): Set the pipeline stage number to automatic. Its value will be selected between 1 and the
-            parameter `pipeline_stages`. This option requires the `parallel_mode` to be ``auto_parallel``
-            and the `search_mode` to be ``recursive_programming``. Default: ``False`` .
         pipeline_result_broadcast (bool): A switch that broadcast the last stage result to all other stage in pipeline
             parallel inference. Default: ``False`` .
         communi_parallel_mode (str): There are tree kinds of communication parallel modes, "all_group_parallel",
@@ -1410,7 +1392,6 @@ def _set_auto_parallel_context(**kwargs):
         and `size`. Config is same as `allgather`.


-
     Raises:
         ValueError: If input key is not attribute in auto parallel context.
     """
@@ -1458,7 +1439,6 @@ def _reset_auto_parallel_context():
     - auto_parallel_search_mode: 'recursive_programming'
     - sharding_propagation: False
     - pipeline_stages: 0
-    - auto_pipeline: False
     - pipeline_result_broadcast: False
     - gradient_accumulation_shard: True
     - fusion_threshold: 64
mindspore/parallel/_tensor.py CHANGED
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 import copy
 import numpy as np
 from mindspore.common.tensor import Tensor
+from mindspore.common import dtype as mstype
 from mindspore.communication.management import get_rank, get_group_size
 from mindspore._c_expression import TensorTransform

@@ -223,6 +224,10 @@ def _load_tensor(tensor, dev_mat, tensor_map, full_shape=None, rank_id=-1):
     rank = rank_id
     tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
     tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
+    if tensor.dtype == mstype.bfloat16:
+        from mindspore.ops.operations import Cast
+        cpu_cast = Cast().set_device("CPU")
+        tensor = cpu_cast(tensor, mstype.float32)
     np_tensor = tensor.asnumpy()
     if full_shape:
         np_tensor = np_tensor.reshape(full_shape)
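This branch exists because NumPy has no bfloat16 dtype, so asnumpy() cannot be called on a bfloat16 tensor directly; the tensor is first cast to float32 on the CPU. A minimal sketch of the same workaround, using the operators shown in the hunk (to_numpy is an illustrative name, not a package API):

from mindspore.common import dtype as mstype
from mindspore.ops.operations import Cast

def to_numpy(tensor):
    """Convert a MindSpore tensor to a numpy array, routing bfloat16 through float32."""
    if tensor.dtype == mstype.bfloat16:
        cpu_cast = Cast().set_device("CPU")    # run the cast on the host device
        tensor = cpu_cast(tensor, mstype.float32)  # nearest numpy-representable type
    return tensor.asnumpy()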
@@ -400,7 +400,7 @@ class FeedForward(Cell):
     >>> from mindspore.nn.transformer import FeedForward
     >>> from mindspore import dtype as mstype
     >>> from mindspore import Tensor, nn
->>>
+    >>> import mindspore.ops as ops
     >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
     >>> tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
     >>> output = model(tensor)
mindspore/parallel/_utils.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023
+# Copyright 2023 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,8 +27,6 @@ from mindspore.common.seed import get_seed
 from mindspore._c_expression import GraphExecutor_
 from mindspore.parallel._tensor import _load_tensor_by_layout

-SUPPORTED_TUPLE_IN_TUPLE_STRATEGY = ["GroupedMatmul", "FusedInferAttentionScore"]
-

 def _get_parallel_mode():
     """Get parallel mode."""
@@ -94,18 +92,6 @@ def _need_to_full():

 def _slice_parameter(parameter, phase, layout):
     """Slice python parameter obj according to the layout."""
-    is_train_phase = phase.startswith('train')
-    is_prefill_phase = phase.startswith('prefill')
-    if layout is not None and parameter.from_ckpt and not is_train_phase:
-        is_opt_shard_group = layout[5]
-        if not parameter.sliced and is_prefill_phase and is_opt_shard_group:
-            rank = get_rank()
-            new_tensor = _load_tensor_by_layout(parameter, layout, rank)
-            parameter.set_data(new_tensor, True)
-            return
-        layout_shape = layout[2]
-        parameter.shape = tuple(layout_shape)
-        return
     graph_executor = GraphExecutor_.get_instance()
     new_param = parameter.init_data(layout, set_sliced=True)
     parameter = new_param
@@ -227,7 +227,9 @@ get_algo_parameters_config_func_map = {
                 enable_algo_approxi=bool, algo_approxi_epsilon=float)
 def set_algo_parameters(**kwargs):
     """
-    Set parameters in the algorithm for parallel strategy searching.
+    Set parameters in the algorithm for parallel strategy searching. See a typical use in
+    `test_auto_parallel_resnet.py
+    <https://gitee.com/mindspore/mindspore/blob/master/tests/ut/python/parallel/test_auto_parallel_resnet.py>`_.

     Note:
         The attribute name is required. This interface works ONLY in AUTO_PARALLEL mode.
@@ -409,22 +409,19 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
     src_layout_map = _extract_layout_map(src_strategy_file)
     dst_layout_map = _extract_layout_map(dst_strategy_file)
     pipeline_stage_num = _extract_pipeline_stage_num(src_strategy_file)
-    dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
     if src_layout_map:
-        src_param_keys = {param_name for param_name in src_layout_map if
-                          not param_name.startswith(("accu_grads", "adam_v", "adam_m"))}
+        src_param_keys = {param_name for param_name in src_layout_map if not param_name.startswith("accu_grads")}
     if dst_layout_map:
-        dst_param_keys = {param_name for param_name in dst_layout_map if
-
-
-
-
-
-
+        dst_param_keys = {param_name for param_name in dst_layout_map if not param_name.startswith("accu_grads")}
+    if src_layout_map and dst_layout_map and pipeline_stage_num == 1 \
+            and src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+        dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
+        if dst_stage_num > 1:
+            raise NotImplementedError("When using unmerged src strategy, dst strategy doesn't \
+                support strategy with pipeline parallel.")
         _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
                                        src_strategy_file, dst_strategy_file)
     else:
-        ms.log.info("Transform checkpoints by all pipeline stage.")
         _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
                                src_strategy_file, dst_strategy_file)

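The rewritten guard treats the source strategy as "unmerged" when its parameter names form a strict subset of the destination's. A tiny self-contained illustration of that set test (the parameter names are invented):

src_param_keys = {"layer0.weight", "layer1.weight"}
dst_param_keys = {"layer0.weight", "layer1.weight", "layer2.weight"}

# Strict subset: every src key exists in dst, and dst has extra keys.
is_unmerged_src = (src_param_keys.issubset(dst_param_keys)
                   and len(src_param_keys) < len(dst_param_keys))
print(is_unmerged_src)  # True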
@@ -451,7 +448,7 @@ def _sync_params(name, param, layout):
         self.is_send = is_send
         self.ret = ms.Tensor([0])

-        from mindspore.ops import Send, Receive
+        from mindspore.ops.operations._inner_ops import Send, Receive
         if self.is_send:
             self.send = Send(sr_tag=sr_tag, dest_rank=peer_rank)
         else:
@@ -117,7 +117,6 @@ class _ProcessManager:
         self.join = args.join
         self.cluster_time_out = args.cluster_time_out
         self.bind_core = args.bind_core
-        self.rank_table_file = args.rank_table_file

         self.sim_level = args.sim_level
         self.sim_rank_id = args.sim_rank_id
@@ -128,14 +127,6 @@ class _ProcessManager:
             self.worker_num = 1
             self.local_worker_num = 1
             os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
-        elif os.getenv("MS_SIMULATION_LEVEL"):
-            # If simulation level env is set, load RANK_ID and RANK_SIZE envs.
-            self.worker_num = 1
-            self.local_worker_num = 1
-            self.is_simulation = True
-            self.sim_rank_id = os.getenv("RANK_ID", "0")
-            if os.getenv("RANK_SIZE"):
-                self.exported_rank_size = os.getenv("RANK_SIZE")

         self.cmd = args.task_script
         self.cmd_args = args.task_script_args
@@ -161,10 +152,6 @@ class _ProcessManager:

         """
         os.environ["RANK_SIZE"] = str(self.exported_rank_size)
-        if self.rank_table_file != "":
-            os.environ["RANK_TABLE_FILE"] = self.rank_table_file
-            logger.warning(f"msrun launching distributed job with user configured rank table file path:"
-                           f"{self.rank_table_file}")
         if self.is_scale:
             response_message = _send_scale_num(self.scheduler_url, self.scale_num)
             is_first_manager = response_message
@@ -231,8 +218,8 @@ class _ProcessManager:
             raise RuntimeError("Fail to get cpu number from /proc/cpuinfo.")
         if self.bind_core:
             avg = int(cpu_num) // self.local_worker_num
-            cpu_start = avg * i
-            cpu_end =
+            cpu_start = avg * i + 1
+            cpu_end = avg * (i + 1)
             cmd = _generate_cmd_args_list_with_core(self.cmd, self.cmd_args, cpu_start, cpu_end)
         else:
             cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
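With bind_core enabled, each local worker now gets a contiguous core range that skips core 0. A quick worked example of the arithmetic (the 96-core count is made up):

cpu_num = 96                        # pretend /proc/cpuinfo reported 96 cores
local_worker_num = 8
avg = cpu_num // local_worker_num   # 12 cores per worker

for i in range(local_worker_num):
    cpu_start = avg * i + 1         # "+ 1" keeps core 0 out of every range
    cpu_end = avg * (i + 1)
    print(f"worker {i}: taskset -c {cpu_start}-{cpu_end}")
# worker 0 -> 1-12, worker 1 -> 13-24, ..., worker 7 -> 85-96

Note that on this hypothetical box the last range ends at 96 even though the highest core index is 95: freeing core 0 shifts every range up by one.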
@@ -328,17 +315,31 @@ class _ProcessManager:
         """
         scheduler_log_path = os.path.join(self.log_dir, "scheduler.log")
         time_out_node_ids = []
-
-
-
-
-
-
-
-
-
-
-
+        with open(scheduler_log_path, "r") as log:
+            scheduler_log = log.read()
+        # Filter out abnormal logs.
+        time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)
+
+        # Filter out node ids of the processes which exit abnormally.
+        def node_id_splitter(id):
+            return re.split(" is timed out", re.split("node: ", id)[1])[0]
+        for id in time_out_node_log:
+            time_out_node_ids.append(node_id_splitter(id))
+
+        # If 'time_out_node_ids' is not empty, only analyze logs of these time out nodes.
+        # Unless get the error logs of all workers.
+        if time_out_node_ids:
+            os.system(f"cat {scheduler_log_path}|grep -E 'ERROR|CRITICAL|Traceback|Error' -C 5")
             logger.error(f"Time out nodes are {time_out_node_ids}")
-
-
+            # Get the logs which have these timeout node ids.
+            def grepper(id):
+                return subprocess.getoutput(f"grep -rn 'This node {id}' {self.log_dir}"" | awk -F: '{print $1}'")
+            log_names = []
+            for id in time_out_node_ids:
+                log_names.append(grepper(id))
+            for log in log_names:
+                logger.error(f"cat log {log} error info and tail log:"
+                             "==========================")
+                os.system(f"cat {log}|grep -E 'ERROR|CRITICAL|Traceback|Error' -C 5")
+        else:
+            os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
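The added analysis extracts node ids by nesting two re.split calls inside the matches found by re.findall. A standalone sketch of just the parsing step, with an invented log snippet:

import re

scheduler_log = (
    "INFO  heartbeat ok\n"
    "ERROR node: worker-3 is timed out after 300s\n"
    "ERROR node: worker-7 is timed out after 300s\n"
)

matches = re.findall(r"node: .* is timed out", scheduler_log)
node_ids = [re.split(" is timed out", re.split("node: ", m)[1])[0] for m in matches]
print(node_ids)  # ['worker-3', 'worker-7']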
@@ -15,7 +15,6 @@
 """Utils for ms_run"""
 import os
 import json
-import socket
 import requests
 import mindspore.log as logger

@@ -37,7 +36,7 @@ def _generate_cmd_args_list(cmd, cmd_args):
     """
     Generates arguments list for 'Popen'. It consists of a binary file name and subsequential arguments.
     """
-    if cmd not in ['python', 'pytest'
+    if cmd not in ['python', 'pytest']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         return ['python'] + [cmd] + cmd_args
     return [cmd] + cmd_args
@@ -50,7 +49,7 @@ def _generate_cmd_args_list_with_core(cmd, cmd_args, cpu_start, cpu_end):
     # Bind cpu cores to this process.
     taskset_args = ['taskset'] + ['-c'] + [str(cpu_start) + '-' + str(cpu_end)]
     final_cmd = []
-    if cmd not in ['python', 'pytest'
+    if cmd not in ['python', 'pytest']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         final_cmd = taskset_args + ['python'] + [cmd] + cmd_args
     else:
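Combined with the core ranges above, the launcher builds an argv list prefixed with taskset -c. A self-contained sketch of the assembly (build_bound_cmd is an illustrative name):

def build_bound_cmd(cmd, cmd_args, cpu_start, cpu_end):
    """Build a Popen-style argv that pins the process to a core range via taskset."""
    taskset_args = ["taskset", "-c", f"{cpu_start}-{cpu_end}"]
    if cmd not in ("python", "pytest"):
        # No interpreter given: launch the script with python by default.
        return taskset_args + ["python", cmd] + cmd_args
    return taskset_args + [cmd] + cmd_args

print(build_bound_cmd("train.py", ["--epochs", "10"], 1, 12))
# ['taskset', '-c', '1-12', 'python', 'train.py', '--epochs', '10']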
@@ -76,20 +75,11 @@ def _is_local_ip(ip_address):
     p = os.popen("ip -j addr")
     addr_info_str = p.read()
     p.close()
-    if not addr_info_str:
-        # This means this host has no "ip -j addr" command.
-        # We use socket module to get local ip address.
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect((ip_address, 0))
-        current_ip = s.getsockname()[0]
-        s.close()
-        return current_ip == ip_address
-
     addr_infos = json.loads(addr_info_str)
     for info in addr_infos:
         for addr in info["addr_info"]:
             if addr["local"] == ip_address:
-                logger.info(f"IP address found on this node. Address info:{addr}.
+                logger.info(f"IP address found on this node. Address info:{addr}.")
                 return True
     return False

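The deleted fallback used a classic trick: connect() on a UDP socket transmits nothing, but it makes the kernel choose a source address, which getsockname() then reveals. A runnable sketch of the idea; the removed code connected to port 0, which some platforms reject, so this sketch uses 80 (probe_source_ip is an illustrative name):

import socket

def probe_source_ip(target_ip):
    """Return the local source IP the kernel would use to reach target_ip.
    UDP connect() sends no packets; it only selects a route and source address."""
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect((target_ip, 80))
        return s.getsockname()[0]
    finally:
        s.close()

# A node can then test whether an address is local:
# probe_source_ip("192.168.1.10") == "192.168.1.10"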
@@ -24,37 +24,47 @@ def get_args():
     """
     parser = ArgumentParser()
     parser.add_argument(
-        "--worker_num",
+        "--worker_num",
+        type=int,
+        default=8,
         help="the total number of nodes participating in the training, an integer variable, "
              "with a default value of 8."
     )
     parser.add_argument(
         "--local_worker_num",
-        type=int,
+        type=int,
+        default=8,
         help="the number of nodes participating in local training, an integer variable, "
              "with a default value of 8."
     )
     parser.add_argument(
         "--master_addr",
-        default="127.0.0.1",
+        default="127.0.0.1",
+        type=str,
         help="specifies the IP address of the scheduler and its data type is string."
              " Allowed values: valid IP addresses."
     )
     parser.add_argument(
-        "--master_port",
+        "--master_port",
+        default=8118,
+        type=int,
         help="specifies the port number of the scheduler, and its data type is integer."
              " Allowed values: port numbers within the range of 1024 to 65535 that are not "
              "already in use."
     )
     parser.add_argument(
-        "--node_rank",
+        "--node_rank",
+        default=-1,
+        type=int,
         help="specifies the rank of current physical node, and its data type is integer."
              " This parameter is used for rank id assignment for each process on the node."
              " If not set, MindSpore will assign rank ids automatically and"
              " rank id of each process on the same node will be continuous."
     )
     parser.add_argument(
-        "--log_dir",
+        "--log_dir",
+        default="",
+        type=str,
         help="specifies the log output file path."
     )
     parser.add_argument(
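The rc2 side pins explicit type= and default= on every msrun flag; without type=int, argparse hands back strings and downstream arithmetic breaks. A short demonstration:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--worker_num", type=int, default=8)
args = parser.parse_args(["--worker_num", "4"])
print(args.worker_num + 1)  # 5 -- parsed as int; without type=int, "4" + 1 raises TypeError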
@@ -95,13 +105,6 @@ def get_args():
         type=int,
         help="specifies simulation process's rank id. Only one process is spawned in simulation scenario."
     )
-    parser.add_argument(
-        "--rank_table_file",
-        default="",
-        type=str,
-        help="specifies rank table file path. This path is not used to initialize distributed job in "
-             "'rank table file manner' but to help support other features."
-    )
     parser.add_argument(
         "task_script",
         type=str,
@@ -131,7 +131,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
         param_redundancy_reversed.setdefault(item, []).append(key)
     if not param_redundancy_reversed:
         return
-    if cur_rank not in single_params:
+    if not cur_rank not in single_params:
         return
     net_param_dict = net.parameters_dict()
     ms.set_auto_parallel_context(parallel_mode="hybrid_parallel")
@@ -140,7 +140,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
     allreduce_input = []
     for param in params:
         if param not in net_param_dict:
-            raise ValueError(
+            raise ValueError("For parameter broadcast, the param: {param} can not be found.")
         real_param = net_param_dict[param]
         if param not in single_params[cur_rank]:
             real_param.set_data(Tensor(np.zeros(real_param.shape), dtype=real_param.dtype))