mindspore 2.3.0__cp310-cp310-win_amd64.whl → 2.4.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.
Files changed (275)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/common/__init__.py +6 -4
  13. mindspore/common/_pijit_context.py +190 -0
  14. mindspore/common/_register_for_tensor.py +2 -1
  15. mindspore/common/_tensor_overload.py +139 -0
  16. mindspore/common/api.py +102 -87
  17. mindspore/common/dump.py +5 -6
  18. mindspore/common/generator.py +1 -7
  19. mindspore/common/hook_handle.py +14 -26
  20. mindspore/common/initializer.py +51 -15
  21. mindspore/common/mindir_util.py +2 -2
  22. mindspore/common/parameter.py +62 -15
  23. mindspore/common/recompute.py +39 -9
  24. mindspore/common/sparse_tensor.py +7 -3
  25. mindspore/common/tensor.py +183 -37
  26. mindspore/communication/__init__.py +1 -1
  27. mindspore/communication/_comm_helper.py +38 -3
  28. mindspore/communication/comm_func.py +315 -60
  29. mindspore/communication/management.py +14 -14
  30. mindspore/context.py +132 -22
  31. mindspore/dataset/__init__.py +1 -1
  32. mindspore/dataset/audio/__init__.py +1 -1
  33. mindspore/dataset/core/config.py +7 -0
  34. mindspore/dataset/core/validator_helpers.py +7 -0
  35. mindspore/dataset/engine/cache_client.py +1 -1
  36. mindspore/dataset/engine/datasets.py +72 -44
  37. mindspore/dataset/engine/datasets_audio.py +7 -7
  38. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  39. mindspore/dataset/engine/datasets_text.py +20 -20
  40. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  41. mindspore/dataset/engine/datasets_vision.py +33 -33
  42. mindspore/dataset/engine/iterators.py +29 -0
  43. mindspore/dataset/engine/obs/util.py +7 -0
  44. mindspore/dataset/engine/queue.py +114 -60
  45. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  46. mindspore/dataset/engine/validators.py +34 -14
  47. mindspore/dataset/text/__init__.py +1 -4
  48. mindspore/dataset/transforms/__init__.py +0 -3
  49. mindspore/dataset/utils/line_reader.py +2 -0
  50. mindspore/dataset/vision/__init__.py +1 -4
  51. mindspore/dataset/vision/utils.py +1 -1
  52. mindspore/dataset/vision/validators.py +2 -1
  53. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  54. mindspore/experimental/es/embedding_service.py +883 -0
  55. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  56. mindspore/experimental/llm_boost/__init__.py +21 -0
  57. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  58. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  59. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  60. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  61. mindspore/experimental/llm_boost/register.py +129 -0
  62. mindspore/experimental/llm_boost/utils.py +31 -0
  63. mindspore/experimental/optim/adamw.py +85 -0
  64. mindspore/experimental/optim/optimizer.py +3 -0
  65. mindspore/hal/__init__.py +3 -3
  66. mindspore/hal/contiguous_tensors_handle.py +175 -0
  67. mindspore/hal/stream.py +18 -0
  68. mindspore/include/api/model_group.h +13 -1
  69. mindspore/include/api/types.h +10 -10
  70. mindspore/include/dataset/config.h +2 -2
  71. mindspore/include/dataset/constants.h +2 -2
  72. mindspore/include/dataset/execute.h +2 -2
  73. mindspore/include/dataset/vision.h +4 -0
  74. mindspore/log.py +1 -1
  75. mindspore/mindrecord/filewriter.py +68 -51
  76. mindspore/mindspore_backend.dll +0 -0
  77. mindspore/mindspore_common.dll +0 -0
  78. mindspore/mindspore_core.dll +0 -0
  79. mindspore/mindspore_np_dtype.dll +0 -0
  80. mindspore/mindspore_ops.dll +0 -0
  81. mindspore/mint/__init__.py +983 -46
  82. mindspore/mint/distributed/__init__.py +31 -0
  83. mindspore/mint/distributed/distributed.py +254 -0
  84. mindspore/mint/nn/__init__.py +268 -23
  85. mindspore/mint/nn/functional.py +125 -19
  86. mindspore/mint/nn/layer/__init__.py +39 -0
  87. mindspore/mint/nn/layer/activation.py +133 -0
  88. mindspore/mint/nn/layer/normalization.py +477 -0
  89. mindspore/mint/nn/layer/pooling.py +110 -0
  90. mindspore/mint/optim/adamw.py +26 -13
  91. mindspore/mint/special/__init__.py +63 -0
  92. mindspore/multiprocessing/__init__.py +2 -1
  93. mindspore/nn/__init__.py +0 -1
  94. mindspore/nn/cell.py +276 -96
  95. mindspore/nn/layer/activation.py +211 -44
  96. mindspore/nn/layer/basic.py +137 -10
  97. mindspore/nn/layer/embedding.py +137 -2
  98. mindspore/nn/layer/normalization.py +101 -5
  99. mindspore/nn/layer/padding.py +34 -48
  100. mindspore/nn/layer/pooling.py +161 -7
  101. mindspore/nn/layer/transformer.py +3 -3
  102. mindspore/nn/loss/__init__.py +2 -2
  103. mindspore/nn/loss/loss.py +84 -6
  104. mindspore/nn/optim/__init__.py +2 -1
  105. mindspore/nn/optim/adadelta.py +1 -1
  106. mindspore/nn/optim/adam.py +1 -1
  107. mindspore/nn/optim/lamb.py +1 -1
  108. mindspore/nn/optim/tft_wrapper.py +124 -0
  109. mindspore/nn/wrap/cell_wrapper.py +12 -23
  110. mindspore/nn/wrap/grad_reducer.py +5 -5
  111. mindspore/nn/wrap/loss_scale.py +17 -3
  112. mindspore/numpy/__init__.py +1 -1
  113. mindspore/numpy/array_creations.py +65 -68
  114. mindspore/numpy/array_ops.py +64 -60
  115. mindspore/numpy/fft.py +610 -75
  116. mindspore/numpy/logic_ops.py +11 -10
  117. mindspore/numpy/math_ops.py +85 -84
  118. mindspore/numpy/utils_const.py +4 -4
  119. mindspore/opencv_core452.dll +0 -0
  120. mindspore/opencv_imgcodecs452.dll +0 -0
  121. mindspore/opencv_imgproc452.dll +0 -0
  122. mindspore/ops/__init__.py +6 -4
  123. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
  124. mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
  125. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  126. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  127. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  128. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  129. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
  130. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  131. mindspore/ops/auto_generate/gen_extend_func.py +767 -13
  132. mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
  133. mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
  134. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  135. mindspore/ops/composite/base.py +85 -48
  136. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  137. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  138. mindspore/ops/function/__init__.py +22 -0
  139. mindspore/ops/function/array_func.py +492 -153
  140. mindspore/ops/function/debug_func.py +113 -1
  141. mindspore/ops/function/fft_func.py +15 -2
  142. mindspore/ops/function/grad/grad_func.py +3 -2
  143. mindspore/ops/function/math_func.py +564 -207
  144. mindspore/ops/function/nn_func.py +817 -383
  145. mindspore/ops/function/other_func.py +3 -2
  146. mindspore/ops/function/random_func.py +402 -12
  147. mindspore/ops/function/reshard_func.py +13 -11
  148. mindspore/ops/function/sparse_unary_func.py +1 -1
  149. mindspore/ops/function/vmap_func.py +3 -2
  150. mindspore/ops/functional.py +24 -14
  151. mindspore/ops/op_info_register.py +3 -3
  152. mindspore/ops/operations/__init__.py +7 -2
  153. mindspore/ops/operations/_grad_ops.py +2 -76
  154. mindspore/ops/operations/_infer_ops.py +1 -1
  155. mindspore/ops/operations/_inner_ops.py +71 -94
  156. mindspore/ops/operations/array_ops.py +14 -146
  157. mindspore/ops/operations/comm_ops.py +63 -53
  158. mindspore/ops/operations/custom_ops.py +83 -19
  159. mindspore/ops/operations/debug_ops.py +42 -10
  160. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  161. mindspore/ops/operations/manually_defined/ops_def.py +273 -20
  162. mindspore/ops/operations/math_ops.py +12 -223
  163. mindspore/ops/operations/nn_ops.py +20 -114
  164. mindspore/ops/operations/other_ops.py +7 -4
  165. mindspore/ops/operations/random_ops.py +46 -1
  166. mindspore/ops/primitive.py +18 -6
  167. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  168. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  169. mindspore/ops_generate/gen_constants.py +36 -0
  170. mindspore/ops_generate/gen_ops.py +67 -52
  171. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  172. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  173. mindspore/ops_generate/op_proto.py +10 -3
  174. mindspore/ops_generate/pyboost_utils.py +14 -1
  175. mindspore/ops_generate/template.py +43 -21
  176. mindspore/parallel/__init__.py +3 -1
  177. mindspore/parallel/_auto_parallel_context.py +31 -9
  178. mindspore/parallel/_cell_wrapper.py +85 -0
  179. mindspore/parallel/_parallel_serialization.py +47 -19
  180. mindspore/parallel/_tensor.py +127 -13
  181. mindspore/parallel/_utils.py +53 -22
  182. mindspore/parallel/algo_parameter_config.py +5 -5
  183. mindspore/parallel/checkpoint_transform.py +46 -39
  184. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  185. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  186. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  187. mindspore/parallel/parameter_broadcast.py +3 -4
  188. mindspore/parallel/shard.py +162 -31
  189. mindspore/parallel/transform_safetensors.py +1146 -0
  190. mindspore/profiler/__init__.py +2 -1
  191. mindspore/profiler/common/constant.py +29 -0
  192. mindspore/profiler/common/registry.py +47 -0
  193. mindspore/profiler/common/util.py +28 -0
  194. mindspore/profiler/dynamic_profiler.py +694 -0
  195. mindspore/profiler/envprofiling.py +17 -19
  196. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  197. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  198. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  199. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  200. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  201. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  202. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  203. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  204. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  205. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  206. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  207. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  208. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  209. mindspore/profiler/parser/framework_parser.py +1 -391
  210. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  211. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  212. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  213. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  214. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  215. mindspore/profiler/parser/profiler_info.py +78 -6
  216. mindspore/profiler/profiler.py +153 -0
  217. mindspore/profiler/profiling.py +285 -413
  218. mindspore/rewrite/__init__.py +1 -2
  219. mindspore/rewrite/common/namespace.py +4 -4
  220. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  221. mindspore/run_check/_check_version.py +39 -104
  222. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  223. mindspore/train/__init__.py +4 -3
  224. mindspore/train/_utils.py +105 -19
  225. mindspore/train/amp.py +171 -53
  226. mindspore/train/callback/__init__.py +2 -2
  227. mindspore/train/callback/_callback.py +4 -4
  228. mindspore/train/callback/_checkpoint.py +97 -31
  229. mindspore/train/callback/_cluster_monitor.py +1 -1
  230. mindspore/train/callback/_flops_collector.py +1 -0
  231. mindspore/train/callback/_loss_monitor.py +3 -3
  232. mindspore/train/callback/_on_request_exit.py +145 -31
  233. mindspore/train/callback/_summary_collector.py +5 -5
  234. mindspore/train/callback/_tft_register.py +375 -0
  235. mindspore/train/dataset_helper.py +15 -3
  236. mindspore/train/metrics/metric.py +3 -3
  237. mindspore/train/metrics/roc.py +4 -4
  238. mindspore/train/mind_ir_pb2.py +44 -39
  239. mindspore/train/model.py +154 -58
  240. mindspore/train/serialization.py +342 -128
  241. mindspore/utils/__init__.py +21 -0
  242. mindspore/utils/utils.py +60 -0
  243. mindspore/version.py +1 -1
  244. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
  245. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +248 -242
  246. mindspore/include/c_api/ms/abstract.h +0 -67
  247. mindspore/include/c_api/ms/attribute.h +0 -197
  248. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  249. mindspore/include/c_api/ms/base/macros.h +0 -32
  250. mindspore/include/c_api/ms/base/status.h +0 -33
  251. mindspore/include/c_api/ms/base/types.h +0 -283
  252. mindspore/include/c_api/ms/context.h +0 -102
  253. mindspore/include/c_api/ms/graph.h +0 -160
  254. mindspore/include/c_api/ms/node.h +0 -606
  255. mindspore/include/c_api/ms/tensor.h +0 -161
  256. mindspore/include/c_api/ms/value.h +0 -84
  257. mindspore/mindspore_shared_lib.dll +0 -0
  258. mindspore/nn/extend/basic.py +0 -140
  259. mindspore/nn/extend/embedding.py +0 -143
  260. mindspore/nn/extend/layer/normalization.py +0 -109
  261. mindspore/nn/extend/pooling.py +0 -117
  262. mindspore/nn/layer/embedding_service.py +0 -531
  263. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  264. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  265. mindspore/ops/extend/__init__.py +0 -53
  266. mindspore/ops/extend/array_func.py +0 -218
  267. mindspore/ops/extend/math_func.py +0 -76
  268. mindspore/ops/extend/nn_func.py +0 -308
  269. mindspore/ops/silent_check.py +0 -162
  270. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  271. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  272. mindspore/train/callback/_mindio_ttp.py +0 -443
  273. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +0 -0
  274. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
  275. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
- # Copyright 2022-2023 Huawei Technologies Co., Ltd
+ # Copyright 2022-2024 Huawei Technologies Co., Ltd
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -16,13 +16,13 @@
  1. This file is an abstraction of the dataset loading class. It contains
  some basic dataset operations(skip, filter, map, batch, ...).
  2. Specific dataset loading classes can be found in datasets_vision.py, datasets_text.py,
- datasets_audio.py, datasets_standard_format.py and dataets_user_defined.py files.
+ datasets_audio.py, datasets_standard_format.py and datasets_user_defined.py files.
  datasets_vision.py: contains vision dataset loading classes.
  datasets_text.py: contains text dataset loading classes.
  datasets_audio.py: contains audio dataset loading classes.
  datasets_standard_format.py: contains standard format loading classes which
  any other kinds of datasets can be converted to.
- dataets_user_defined.py: contains basic classes that help users to define
+ datasets_user_defined.py: contains basic classes that help users to define
  flexible ways to load dataset.
  """
  import atexit
@@ -66,13 +66,13 @@ from mindspore.dataset.debug import DebugHook

  from mindspore.dataset.engine import samplers
  from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
- ITERATORS_LIST, _unset_iterator_cleanup
+ ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
  from .queue import _SharedQueue, _Queue
  from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
  check_rename, check_device_send, check_take, check_output_shape, check_project, \
  check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
  check_save, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, check_padded_batch, \
- check_total_batch
+ check_total_batch, check_sync_update
  from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
  get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, _get_debug_hook_list
  from ..core.datatypes import mstype_to_detype
@@ -494,6 +494,12 @@ class Dataset:

  .. image:: bucket_batch_by_length_en.png

+ Note:
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.
+
  Args:
  column_names (list[str]): Columns passed to element_length_function.
  bucket_boundaries (list[int]): A list consisting of the upper boundaries
@@ -564,8 +570,12 @@
  .. image:: batch_en.png

  Note:
- The order of using repeat and batch reflects the number of batches and per_batch_map.
- It is recommended that the repeat operation applied after the batch operation finished.
+ - The order of using repeat and batch reflects the number of batches and per_batch_map.
+ It is recommended that the repeat operation applied after the batch operation finished.
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.

  Args:
  batch_size (Union[int, Callable]): The number of rows each batch is created with. An
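
The data-sinking note added above is easy to act on in user code. A minimal sketch (the toy NumpySlicesDataset and column name are illustrative, not from the diff) of keeping batch shapes static by dropping the last incomplete batch:

    import numpy as np
    import mindspore.dataset as ds

    # 10 samples with batch_size=3 would leave a ragged final batch of 1 sample;
    # drop_remainder=True discards it so every batch has the same static shape.
    data = ds.NumpySlicesDataset({"x": np.arange(10).astype(np.float32)}, shuffle=False)
    data = data.batch(batch_size=3, drop_remainder=True)
    for batch in data.create_dict_iterator(output_numpy=True):
        print(batch["x"].shape)  # always (3,)
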
@@ -598,10 +608,10 @@
  name as the input columns, i.e., the columns will be replaced.

  - python_multiprocessing (bool, optional): Parallelize Python function `per_batch_map` with
- multi-processing or multi-threading mode, ``True`` means multi-processing,
- ``False`` means multi-threading If `per_batch_map` is a I/O bound task, use
- multi-threading mode. If `per_batch_map` is a CPU bound task, it is recommended to use
- multi-processing mode. Default: ``False`` , use python multi-threading mode.
+ multiprocessing or multithreading mode, ``True`` means multiprocessing,
+ ``False`` means multithreading If `per_batch_map` is a I/O bound task, use
+ multithreading mode. If `per_batch_map` is a CPU bound task, it is recommended to use
+ multiprocessing mode. Default: ``False`` , use python multithreading mode.

  - max_rowsize(Union[int, list[int]], optional): Maximum size of row in MB that is used for shared memory
  allocation to copy data between processes, the total occupied shared memory will increase as
@@ -611,7 +621,7 @@
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  Returns:
  Dataset, a new dataset with the above operation applied.
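
Together these two doc changes describe how `per_batch_map` workers move data: multiprocessing copies rows through shared memory, and the new `max_rowsize` default of ``None`` sizes that memory dynamically instead of the old fixed 16 MB per row. A hedged sketch (the dataset and callback are illustrative):

    import numpy as np
    import mindspore.dataset as ds

    def scale_batch(col, batch_info):
        # per_batch_map receives each input column as a list of rows plus BatchInfo,
        # and returns the transformed columns as a tuple.
        return ([np.asarray(row) * 2 for row in col],)

    data = ds.NumpySlicesDataset({"x": np.arange(8).astype(np.float32)}, shuffle=False)
    data = data.batch(batch_size=4,
                      per_batch_map=scale_batch,
                      input_columns=["x"],
                      python_multiprocessing=True,  # CPU-bound per_batch_map
                      max_rowsize=None)             # 2.4.1 default: dynamic shared memory
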
@@ -657,8 +667,12 @@
  .. image:: padded_batch_en.png

  Note:
- The order of using repeat and padded_batch reflects the number of batches.
- It is recommended that the repeat operation applied after the padded_batch operation finished.
+ - The order of using repeat and padded_batch reflects the number of batches.
+ It is recommended that the repeat operation applied after the padded_batch operation finished.
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.

  Args:
  batch_size (Union[int, Callable]): The number of rows each batch is created with. An
@@ -905,7 +919,7 @@
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  - cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
  Default: ``None``, which means no cache is used.
@@ -989,8 +1003,8 @@
  num_parallel_workers = 1
  logger.warning(
  "Input 'operations' of 'map' includes network computing operators like in mindspore.nn, mindspore.ops, "
- "mindspore.numpy module and etc, which do not support multi-thread compiling, recommend to replace it "
- "with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")
+ "mindspore.numpy module and etc, which do not support multithreading compiling, recommend to replace "
+ "it with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")

  return MapDataset(self, operations, input_columns, output_columns, num_parallel_workers, **kwargs)

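
The reworded warning keeps the same advice: map transforms should be plain Python/NumPy so they can run on multiple workers. A small sketch of the preferred style (the normalize function is illustrative):

    import numpy as np
    import mindspore.dataset as ds

    # A plain numpy callable parallelizes cleanly across map workers.
    def normalize_np(x):
        return (x - x.mean()) / (x.std() + 1e-6)

    data = ds.NumpySlicesDataset({"x": np.random.rand(16, 4).astype(np.float32)}, shuffle=False)
    data = data.map(operations=normalize_np, input_columns=["x"], num_parallel_workers=4)

    # Passing mindspore.nn / mindspore.ops operators as `operations` would trigger the
    # warning above and force num_parallel_workers down to 1.
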
@@ -1523,8 +1537,8 @@
  2. Before calling the function, do not use batch operation, repeat operation or data augmentation operations
  with random attribute in map operation.
  3. When array dimension is variable, one-dimensional arrays or
- multi-dimensional arrays with variable dimension 0 are supported.
- 4. MindRecord does not support multi-dimensional string or multi-dimensional bytes.
+ multidimensional arrays with variable dimension 0 are supported.
+ 4. MindRecord does not support multidimensional string or multidimensional bytes.

  Args:
  file_name (str): Path to dataset file.
@@ -1741,6 +1755,7 @@
  return self._col_names

  @check_output_shape
+ @_cleanup_the_iterators_if_created
  def output_shapes(self, estimate=False):
  """
  Get the shapes of output data.
@@ -1792,6 +1807,7 @@
  self.saved_output_shapes = output_shapes
  return output_shapes

+ @_cleanup_the_iterators_if_created
  def output_types(self):
  """
  Get the types of output data.
@@ -1826,6 +1842,7 @@
  del self.runtime_context
  return self.saved_output_types

+ @_cleanup_the_iterators_if_created
  def get_dataset_size(self):
  """
  Return the number of batches in an epoch.
@@ -1893,6 +1910,7 @@
  return self.children[0].is_sync()
  return False

+ @check_sync_update
  def sync_update(self, condition_name, num_batch=None, data=None):
  """
  Release a blocking condition and trigger callback with given data.
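
For context on the now-validated `sync_update`: it pairs with `sync_wait` to gate the pipeline on the consumer. A rough sketch of that pairing under the signature shown above (the condition name and toy dataset are illustrative; see the MindSpore docs for the exact contract):

    import numpy as np
    import mindspore.dataset as ds

    data = ds.NumpySlicesDataset({"x": np.arange(8).astype(np.float32)}, shuffle=False)
    # Block the pipeline after each batch until the consumer releases it.
    data = data.sync_wait(condition_name="policy", num_batch=1)
    data = data.batch(batch_size=2)

    for batch in data.create_dict_iterator(output_numpy=True):
        # ... consume the batch, then release the blocking condition
        data.sync_update(condition_name="policy")
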
@@ -2174,7 +2192,7 @@ class TextBaseDataset(Dataset):
  Japanese or Chinese character sets, and 1.0 for other languages with small character sets
  like English or Latin.
  model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
- The input sentence must be pretokenized when using word type.
+ The input sentence must be pre-tokenized when using word type.
  params(dict): Any extra optional parameters of sentencepiece library according to your raw data

  Returns:
@@ -2251,7 +2269,7 @@ class TextBaseDataset(Dataset):
  Japanese or Chinese character sets, and 1.0 for other languages with small character sets
  like English or Latin.
  model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
- The input sentence must be pretokenized when using word type.
+ The input sentence must be pre-tokenized when using word type.
  params(dict): Any extra optional parameters of sentencepiece library according to your raw data

  Returns:
@@ -2629,12 +2647,12 @@ class BatchDataset(UnionBaseDataset):
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  """

  def __init__(self, input_dataset, batch_size, drop_remainder=False, num_parallel_workers=None, per_batch_map=None,
- input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=16):
+ input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=None):
  super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers)

  if BatchDataset._is_ancestor_of_repeat(input_dataset):
@@ -2655,7 +2673,9 @@

  self.python_multiprocessing = python_multiprocessing
  self.process_pool = None
- if isinstance(max_rowsize, int):
+ if max_rowsize is None:
+ self.max_rowsize = [-1, -1]
+ elif isinstance(max_rowsize, int):
  self.max_rowsize = [max_rowsize * self.batch_size] * 2 if max_rowsize != -1 else [max_rowsize, max_rowsize]
  else:
  self.max_rowsize = [max_rowsize[0] * self.batch_size, max_rowsize[1] * self.batch_size]
@@ -3078,7 +3098,7 @@ class Pipe:
  Class to handle communication between the master process and the worker processes.
  """

- def __init__(self, warning_ctl, shared_memory=False, max_rowsize=16):
+ def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
  self.shared_memory = shared_memory
  self.eof = multiprocessing.Event()
  if self.shared_memory:
@@ -3139,7 +3159,10 @@ def _worker_loop(operations, pipe, worker_id):
  """
  Multiprocess worker process loop.
  """
- # Ensure that the process does not hung when exiting
+ # Initialize C++ side signal handlers
+ cde.register_worker_handlers()
+
+ # Ensure that the process does not hang when exiting
  pipe.res_queue.cancel_join_thread()

  def _ignore_sigint():
@@ -3153,6 +3176,7 @@ def _worker_loop(operations, pipe, worker_id):
  # that the random results of each process are different.
  if get_seed() != 5489:
  set_seed(get_seed() + worker_id)
+
  while not _main_process_already_exit():
  _ignore_sigint()

@@ -3184,7 +3208,7 @@ class _MPWorker(multiprocessing.Process):
  Worker process for multiprocessing.
  """

- def __init__(self, operations, warning_ctl, max_rowsize=16, worker_id=0):
+ def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
  shared_memory = get_enable_shared_mem()
  self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
  self.check_interval = get_multiprocessing_timeout_interval()
@@ -3216,14 +3240,6 @@
  logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
  try:
  res = self.pipe.master_receive()
- # Because there is no need to copy when creating Tensors in the C++layer, it reduces the time
- # from np.ndarray to C++Tensor creation. However, when using shared memory in multiple processes,
- # the address of the shared memory will always be passed to subsequent nodes in the dataset pipeline,
- # and the shared memory will also be written by the current node, causing dirty data to be accessed
- # by subsequent nodes in the pipeline. So make a memory copy here to solve the problem of
- # shared memory being contaminated.
- if get_enable_shared_mem():
- res = copy.deepcopy(res)
  except queue.Empty:
  continue
  if res is None:
@@ -3286,7 +3302,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
  self.origin_hook(ex_type, value, tb)
  self.mp_pool_exit_preprocess()

- def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=16):
+ def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=(-1, -1)):
  super(_PythonMultiprocessing, self).__init__()
  self.op_name = op_name
  self.num_parallel_workers = num_parallel_workers
@@ -3302,7 +3318,7 @@

  self.eot = None
  self.watch_dog = None
- self.ppid = os.getpid()
+ self.ppid = None
  self.hook = None
  self.warning_ctl = None
  # cache thread (get_ident()) to worker_id mapping in Python layer
@@ -3327,10 +3343,10 @@
  if child_pid == 0:
  break
  except OSError:
- # waitpid may be failed for some reasons so we ignore this error
+ # waitpid may fail for some reason, so we ignore this error
  pass

- # Dataset need watch_dog thread to monitoring fork multi-processing,
+ # Dataset need watch_dog thread to monitoring fork multiprocessing,
  # and thread can't be a member function otherwise python won't collect and release resources.
  @staticmethod
  def _watch_dog(eot, workers):
@@ -3363,6 +3379,8 @@
  "main process will exit. If this is not an artificial operation, you can use "
  "ds.config.set_enable_watchdog(False) to block this error.")
  os.kill(os.getpid(), signal.SIGTERM)
+ # sleep to release GIL
+ time.sleep(1)

  # release the workers
  del workers
@@ -3451,6 +3469,12 @@
  while _PythonMultiprocessing.is_process_alive(ppid):
  if quit_signal.is_set():
  return
+
+ # independent dataset mode, the subprocess of GeneratorDataset / map / batch should exit when
+ # independent dataset process have exit
+ if os.getppid() != ppid:
+ break
+
  time.sleep(0.1)

  _PythonMultiprocessing._terminate_processes(workers)
@@ -3462,10 +3486,10 @@
  Launch Python multiprocessing pool.

  Args:
- pop_id: ID for operation to have Python multiprocessing pool launched
+ op_id: ID for operation to have Python multiprocessing pool launched

  Returns:
- Python multiprocssing pool is launched.
+ Python multiprocessing pool is launched.
  """
  self.python_threads_to_workers = {}
  self.op_id = op_id
@@ -3476,6 +3500,7 @@
  logger.warning(message)
  self.terminate()
  self.reset()
+ self.ppid = os.getpid()
  self.create_pool()

  def create_pool(self):
@@ -3677,12 +3702,13 @@ class MapDataset(UnionBaseDataset):
  ``python_multiprocessing`` is set to True. If it is an int value, it represents ``input_columns`` and
  ``output_columns`` use this value as the unit to create shared memory. If it is a list, the first element
  represents the ``input_columns`` use this value as the unit to create shared memory, and the second element
- represents ``output_columns`` use this value as the unit to create shared memory. Default: 16.
+ represents ``output_columns`` use this value as the unit to create shared memory. Default: ``None`` ,
+ allocate shared memory dynamically.
  offload (bool, optional): Flag to indicate whether offload is used. Default: ``None``.
  """

  def __init__(self, input_dataset, operations=None, input_columns=None, output_columns=None,
- num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=16,
+ num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=None,
  offload=None):
  super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers, cache=cache)
  self.operations = to_list(operations)
@@ -3708,7 +3734,9 @@
  self.process_pool = None

  self.callbacks = to_list(callbacks)
- if isinstance(max_rowsize, int):
+ if max_rowsize is None:
+ self.max_rowsize = [-1, -1]
+ elif isinstance(max_rowsize, int):
  self.max_rowsize = [max_rowsize] * 2
  else:
  self.max_rowsize = max_rowsize
@@ -63,7 +63,7 @@ class CMUArcticDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``, will use ``0``. This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None``, which means no cache is used.

  Raises:
@@ -180,7 +180,7 @@ class GTZANDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -298,7 +298,7 @@ class LibriTTSDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -425,7 +425,7 @@ class LJSpeechDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -548,7 +548,7 @@ class SpeechCommandsDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
  This argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -661,7 +661,7 @@ class TedliumDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -841,7 +841,7 @@ class YesNoDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument can only
  be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -77,7 +77,7 @@ class CSVDataset(SourceDataset, UnionBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``. This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None``, which means no cache is used.

  Raises:
@@ -156,7 +156,7 @@ class MindDataset(MappableDataset, UnionBaseDataset):
  num_samples (int, optional): The number of samples to be included in the dataset.
  Default: ``None`` , all samples.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -166,6 +166,52 @@ class MindDataset(MappableDataset, UnionBaseDataset):
  RuntimeError: If `shard_id` is specified but `num_shards` is None.
  ValueError: If `shard_id` is not in range of [0, `num_shards` ).

+ Note:
+ - When sharding MindRecord (by configuring `num_shards` and `shard_id`), there are two strategies to implement
+ the data sharding logic. This API uses the strategy 2.
+
+ .. list-table:: Data sharding strategy 1
+ :widths: 50 50 50 50
+ :header-rows: 1
+
+ * - rank 0
+ - rank 1
+ - rank 2
+ - rank 3
+ * - 0
+ - 1
+ - 2
+ - 3
+ * - 4
+ - 5
+ - 6
+ - 7
+ * - 8
+ - 9
+ - 10
+ - 11
+
+ .. list-table:: Data sharding strategy 2
+ :widths: 50 50 50 50
+ :header-rows: 1
+
+ * - rank 0
+ - rank 1
+ - rank 2
+ - rank 3
+ * - 0
+ - 3
+ - 6
+ - 9
+ * - 1
+ - 4
+ - 7
+ - 10
+ * - 2
+ - 5
+ - 8
+ - 11
+
  Note:
  - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
  used in the dataset, and their effects when combined with parameter `sampler` are as follows.
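
Per the new note, strategy 2 interleaves samples: shard k reads rows k, k + num_shards, k + 2*num_shards, and so on. A minimal sketch (the file name is illustrative):

    import mindspore.dataset as ds

    # With strategy 2, shard_id=1 of num_shards=4 sees samples 1, 5, 9, ...
    data = ds.MindDataset(dataset_files="data.mindrecord", num_shards=4, shard_id=1)
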
@@ -307,7 +353,7 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
  When `compression_type` is not ``None``, and `num_samples` or numRows (parsed from `schema` ) is provided,
  `shard_equal_rows` will be implied as ``True``.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.
  compression_type (str, optional): The type of compression used for all files, must be either ``''``,
  ``'GZIP'``, or ``'ZLIB'``. Default: ``None`` , as in empty string. It is highly recommended to
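
All of the cache.html link updates above point at the same DatasetCache feature. A hedged sketch of wiring a cache into one of these loaders (session_id=1 and the file name are illustrative; a cache session must first be created out of band with the cache_admin tool):

    import mindspore.dataset as ds

    cache = ds.DatasetCache(session_id=1, size=0)  # size=0 means no memory cap
    data = ds.TFRecordDataset(dataset_files=["train.tfrecord"], cache=cache)
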
@@ -383,6 +429,10 @@ class OBSMindDataset(GeneratorDataset):

  The columns of generated dataset depend on the source MindRecord files.

+ Note:
+ - This interface accesses the `/cache` directory for node synchronization and requires the user to ensure
+ access to the `/cache` directory.
+
  Args:
  dataset_files (list[str]): List of files in cloud storage to be read and file path is in
  the format of s3://bucketName/objectKey.