mindspore 2.4.10__cp311-cp311-win_amd64.whl → 2.5.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.

Files changed (366)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +8 -3
  3. mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +0 -5
  7. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  8. mindspore/_extends/parse/compile_config.py +64 -0
  9. mindspore/_extends/parse/deprecated/__init__.py +0 -0
  10. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
  11. mindspore/_extends/parse/parser.py +23 -5
  12. mindspore/_extends/parse/standard_method.py +123 -27
  13. mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
  14. mindspore/amp.py +7 -1
  15. mindspore/avcodec-59.dll +0 -0
  16. mindspore/avdevice-59.dll +0 -0
  17. mindspore/avfilter-8.dll +0 -0
  18. mindspore/avformat-59.dll +0 -0
  19. mindspore/avutil-57.dll +0 -0
  20. mindspore/boost/boost_cell_wrapper.py +136 -41
  21. mindspore/common/__init__.py +3 -1
  22. mindspore/common/_register_for_tensor.py +0 -1
  23. mindspore/common/_stub_tensor.py +25 -4
  24. mindspore/common/_tensor_cpp_method.py +17 -0
  25. mindspore/common/_tensor_docs.py +6132 -0
  26. mindspore/common/api.py +98 -21
  27. mindspore/common/dtype.py +34 -34
  28. mindspore/common/dump.py +2 -1
  29. mindspore/common/file_system.py +8 -3
  30. mindspore/common/generator.py +2 -0
  31. mindspore/common/hook_handle.py +3 -1
  32. mindspore/common/initializer.py +3 -4
  33. mindspore/common/lazy_inline.py +8 -2
  34. mindspore/common/mindir_util.py +10 -2
  35. mindspore/common/parameter.py +31 -15
  36. mindspore/common/tensor.py +713 -1337
  37. mindspore/communication/__init__.py +1 -1
  38. mindspore/communication/_comm_helper.py +5 -0
  39. mindspore/communication/comm_func.py +215 -173
  40. mindspore/communication/management.py +23 -20
  41. mindspore/context.py +285 -191
  42. mindspore/dataset/__init__.py +23 -19
  43. mindspore/dataset/callback/ds_callback.py +2 -1
  44. mindspore/dataset/core/config.py +84 -3
  45. mindspore/dataset/engine/cache_admin.py +3 -3
  46. mindspore/dataset/engine/cache_client.py +5 -4
  47. mindspore/dataset/engine/datasets.py +192 -149
  48. mindspore/dataset/engine/datasets_audio.py +14 -0
  49. mindspore/dataset/engine/datasets_standard_format.py +11 -11
  50. mindspore/dataset/engine/datasets_text.py +38 -1
  51. mindspore/dataset/engine/datasets_user_defined.py +100 -66
  52. mindspore/dataset/engine/datasets_vision.py +81 -8
  53. mindspore/dataset/engine/iterators.py +281 -63
  54. mindspore/dataset/engine/obs/util.py +8 -0
  55. mindspore/dataset/engine/queue.py +40 -0
  56. mindspore/dataset/engine/samplers.py +26 -2
  57. mindspore/dataset/engine/serializer_deserializer.py +1 -1
  58. mindspore/dataset/engine/validators.py +43 -11
  59. mindspore/dataset/transforms/py_transforms_util.py +17 -0
  60. mindspore/dataset/transforms/transforms.py +29 -12
  61. mindspore/dataset/vision/validators.py +1 -2
  62. mindspore/device_context/__init__.py +21 -0
  63. mindspore/device_context/ascend/__init__.py +25 -0
  64. mindspore/device_context/ascend/device.py +72 -0
  65. mindspore/device_context/ascend/op_debug.py +94 -0
  66. mindspore/device_context/ascend/op_precision.py +193 -0
  67. mindspore/device_context/ascend/op_tuning.py +127 -0
  68. mindspore/device_context/cpu/__init__.py +25 -0
  69. mindspore/device_context/cpu/device.py +62 -0
  70. mindspore/device_context/cpu/op_tuning.py +43 -0
  71. mindspore/device_context/gpu/__init__.py +21 -0
  72. mindspore/device_context/gpu/device.py +70 -0
  73. mindspore/device_context/gpu/op_precision.py +67 -0
  74. mindspore/device_context/gpu/op_tuning.py +175 -0
  75. mindspore/device_manager.py +134 -0
  76. mindspore/dnnl.dll +0 -0
  77. mindspore/experimental/llm_boost/__init__.py +1 -0
  78. mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
  79. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
  80. mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
  81. mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
  82. mindspore/experimental/llm_boost/atb/llama_boost.py +6 -1
  83. mindspore/experimental/llm_boost/register.py +1 -0
  84. mindspore/experimental/optim/adadelta.py +26 -22
  85. mindspore/experimental/optim/adam.py +3 -0
  86. mindspore/experimental/optim/lr_scheduler.py +33 -24
  87. mindspore/experimental/optim/radam.py +33 -30
  88. mindspore/hal/device.py +28 -0
  89. mindspore/hal/event.py +17 -0
  90. mindspore/hal/memory.py +94 -3
  91. mindspore/hal/stream.py +91 -6
  92. mindspore/include/api/context.h +0 -1
  93. mindspore/jpeg62.dll +0 -0
  94. mindspore/log.py +12 -0
  95. mindspore/mindrecord/__init__.py +1 -1
  96. mindspore/mindrecord/config.py +17 -316
  97. mindspore/mindrecord/filereader.py +1 -9
  98. mindspore/mindrecord/filewriter.py +5 -15
  99. mindspore/mindrecord/mindpage.py +1 -9
  100. mindspore/mindspore_backend.dll +0 -0
  101. mindspore/mindspore_common.dll +0 -0
  102. mindspore/mindspore_core.dll +0 -0
  103. mindspore/mindspore_glog.dll +0 -0
  104. mindspore/mindspore_ops.dll +0 -0
  105. mindspore/mint/__init__.py +824 -218
  106. mindspore/mint/distributed/__init__.py +66 -4
  107. mindspore/mint/distributed/distributed.py +2594 -44
  108. mindspore/mint/linalg/__init__.py +6 -0
  109. mindspore/mint/nn/__init__.py +473 -14
  110. mindspore/mint/nn/functional.py +486 -11
  111. mindspore/mint/nn/layer/__init__.py +17 -4
  112. mindspore/mint/nn/layer/_functions.py +330 -0
  113. mindspore/mint/nn/layer/activation.py +169 -1
  114. mindspore/mint/nn/layer/basic.py +123 -0
  115. mindspore/mint/nn/layer/conv.py +727 -0
  116. mindspore/mint/nn/layer/normalization.py +215 -19
  117. mindspore/mint/nn/layer/padding.py +797 -0
  118. mindspore/mint/nn/layer/pooling.py +170 -0
  119. mindspore/mint/optim/__init__.py +2 -1
  120. mindspore/mint/optim/adam.py +223 -0
  121. mindspore/mint/optim/adamw.py +26 -19
  122. mindspore/mint/special/__init__.py +2 -1
  123. mindspore/multiprocessing/__init__.py +5 -0
  124. mindspore/nn/cell.py +126 -19
  125. mindspore/nn/dynamic_lr.py +2 -1
  126. mindspore/nn/layer/activation.py +6 -6
  127. mindspore/nn/layer/basic.py +35 -25
  128. mindspore/nn/layer/channel_shuffle.py +3 -3
  129. mindspore/nn/layer/embedding.py +3 -3
  130. mindspore/nn/layer/normalization.py +8 -7
  131. mindspore/nn/layer/padding.py +4 -3
  132. mindspore/nn/layer/pooling.py +47 -13
  133. mindspore/nn/layer/rnn_cells.py +1 -1
  134. mindspore/nn/layer/rnns.py +2 -1
  135. mindspore/nn/layer/timedistributed.py +5 -5
  136. mindspore/nn/layer/transformer.py +48 -26
  137. mindspore/nn/learning_rate_schedule.py +5 -3
  138. mindspore/nn/loss/loss.py +31 -36
  139. mindspore/nn/optim/ada_grad.py +1 -0
  140. mindspore/nn/optim/adadelta.py +2 -2
  141. mindspore/nn/optim/adam.py +1 -1
  142. mindspore/nn/optim/lars.py +1 -4
  143. mindspore/nn/optim/optimizer.py +1 -1
  144. mindspore/nn/optim/rprop.py +2 -2
  145. mindspore/nn/optim/thor.py +2 -1
  146. mindspore/nn/utils/init.py +13 -11
  147. mindspore/nn/wrap/cell_wrapper.py +4 -6
  148. mindspore/nn/wrap/loss_scale.py +3 -4
  149. mindspore/numpy/array_creations.py +60 -62
  150. mindspore/numpy/array_ops.py +148 -143
  151. mindspore/numpy/logic_ops.py +41 -42
  152. mindspore/numpy/math_ops.py +361 -359
  153. mindspore/numpy/utils.py +16 -16
  154. mindspore/numpy/utils_const.py +4 -4
  155. mindspore/opencv_core452.dll +0 -0
  156. mindspore/opencv_imgcodecs452.dll +0 -0
  157. mindspore/opencv_imgproc452.dll +0 -0
  158. mindspore/ops/__init__.py +2 -1
  159. mindspore/ops/_grad_experimental/grad_comm_ops.py +94 -13
  160. mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
  161. mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
  162. mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
  163. mindspore/ops/_op_impl/cpu/__init__.py +1 -0
  164. mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
  165. mindspore/ops/_vmap/vmap_array_ops.py +20 -19
  166. mindspore/ops/_vmap/vmap_base.py +0 -2
  167. mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
  168. mindspore/ops/_vmap/vmap_math_ops.py +11 -9
  169. mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
  170. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
  171. mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
  172. mindspore/ops/auto_generate/gen_extend_func.py +554 -60
  173. mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
  174. mindspore/ops/auto_generate/gen_ops_prim.py +8024 -3409
  175. mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
  176. mindspore/ops/composite/base.py +1 -1
  177. mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
  178. mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
  179. mindspore/ops/function/__init__.py +12 -0
  180. mindspore/ops/function/array_func.py +561 -159
  181. mindspore/ops/function/clip_func.py +64 -0
  182. mindspore/ops/function/debug_func.py +28 -20
  183. mindspore/ops/function/image_func.py +1 -1
  184. mindspore/ops/function/linalg_func.py +5 -4
  185. mindspore/ops/function/math_func.py +1659 -290
  186. mindspore/ops/function/nn_func.py +988 -317
  187. mindspore/ops/function/parameter_func.py +3 -56
  188. mindspore/ops/function/random_func.py +243 -33
  189. mindspore/ops/function/sparse_unary_func.py +1 -1
  190. mindspore/ops/functional.py +18 -5
  191. mindspore/ops/functional_overload.py +897 -0
  192. mindspore/ops/operations/__init__.py +3 -2
  193. mindspore/ops/operations/_embedding_cache_ops.py +4 -4
  194. mindspore/ops/operations/_grad_ops.py +2 -34
  195. mindspore/ops/operations/_infer_ops.py +2 -1
  196. mindspore/ops/operations/_inner_ops.py +38 -8
  197. mindspore/ops/operations/array_ops.py +45 -303
  198. mindspore/ops/operations/comm_ops.py +19 -16
  199. mindspore/ops/operations/custom_ops.py +11 -55
  200. mindspore/ops/operations/debug_ops.py +42 -47
  201. mindspore/ops/operations/inner_ops.py +6 -4
  202. mindspore/ops/operations/linalg_ops.py +3 -2
  203. mindspore/ops/operations/manually_defined/ops_def.py +185 -104
  204. mindspore/ops/operations/math_ops.py +11 -216
  205. mindspore/ops/operations/nn_ops.py +146 -308
  206. mindspore/ops/primitive.py +23 -21
  207. mindspore/ops/tensor_method.py +1669 -0
  208. mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
  209. mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
  210. mindspore/ops_generate/arg_handler.py +0 -61
  211. mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
  212. mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
  213. mindspore/ops_generate/base_generator.py +11 -0
  214. mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
  215. mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
  216. mindspore/ops_generate/functional_overload_py_generator.py +110 -0
  217. mindspore/ops_generate/functions_cc_generator.py +233 -0
  218. mindspore/ops_generate/gen_aclnn_implement.py +110 -114
  219. mindspore/ops_generate/gen_constants.py +157 -3
  220. mindspore/ops_generate/gen_ops.py +245 -990
  221. mindspore/ops_generate/gen_pyboost_func.py +97 -998
  222. mindspore/ops_generate/gen_utils.py +119 -33
  223. mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
  224. mindspore/ops_generate/op_api_proto.py +206 -0
  225. mindspore/ops_generate/op_def_py_generator.py +131 -0
  226. mindspore/ops_generate/op_prim_py_generator.py +480 -0
  227. mindspore/ops_generate/op_proto.py +373 -108
  228. mindspore/ops_generate/op_template_parser.py +436 -0
  229. mindspore/ops_generate/ops_def_cc_generator.py +288 -0
  230. mindspore/ops_generate/ops_def_h_generator.py +74 -0
  231. mindspore/ops_generate/ops_name_h_generator.py +68 -0
  232. mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
  233. mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
  234. mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
  235. mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
  236. mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
  237. mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
  238. mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
  239. mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
  240. mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
  241. mindspore/ops_generate/pyboost_utils.py +92 -33
  242. mindspore/ops_generate/template.py +294 -44
  243. mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
  244. mindspore/parallel/__init__.py +3 -3
  245. mindspore/parallel/_auto_parallel_context.py +24 -33
  246. mindspore/parallel/_parallel_serialization.py +13 -2
  247. mindspore/parallel/_utils.py +4 -1
  248. mindspore/parallel/algo_parameter_config.py +1 -1
  249. mindspore/parallel/checkpoint_transform.py +44 -0
  250. mindspore/parallel/cluster/process_entity/_api.py +131 -37
  251. mindspore/parallel/cluster/process_entity/_utils.py +41 -6
  252. mindspore/parallel/cluster/run.py +20 -3
  253. mindspore/parallel/parameter_broadcast.py +1 -1
  254. mindspore/parallel/shard.py +3 -0
  255. mindspore/parallel/transform_safetensors.py +119 -253
  256. mindspore/profiler/__init__.py +17 -4
  257. mindspore/profiler/analysis/__init__.py +0 -0
  258. mindspore/profiler/analysis/parser/__init__.py +0 -0
  259. mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
  260. mindspore/profiler/analysis/parser/base_parser.py +158 -0
  261. mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
  262. mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
  263. mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
  264. mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
  265. mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
  266. mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
  267. mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
  268. mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
  269. mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
  270. mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
  271. mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
  272. mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
  273. mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
  274. mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
  275. mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
  276. mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
  277. mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
  278. mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
  279. mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
  280. mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
  281. mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
  282. mindspore/profiler/analysis/task_manager.py +131 -0
  283. mindspore/profiler/analysis/time_converter.py +84 -0
  284. mindspore/profiler/analysis/viewer/__init__.py +0 -0
  285. mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
  286. mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
  287. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
  288. mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
  289. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
  290. mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
  291. mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
  292. mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
  293. mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
  294. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
  295. mindspore/profiler/analysis/work_flow.py +73 -0
  296. mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
  297. mindspore/profiler/common/command_executor.py +90 -0
  298. mindspore/profiler/common/constant.py +174 -3
  299. mindspore/profiler/common/file_manager.py +208 -0
  300. mindspore/profiler/common/log.py +130 -0
  301. mindspore/profiler/common/msprof_cmd_tool.py +202 -0
  302. mindspore/profiler/common/path_manager.py +371 -0
  303. mindspore/profiler/common/process_bar.py +168 -0
  304. mindspore/profiler/common/process_pool.py +9 -3
  305. mindspore/profiler/common/profiler_context.py +476 -0
  306. mindspore/profiler/common/profiler_info.py +304 -0
  307. mindspore/profiler/common/profiler_output_path.py +284 -0
  308. mindspore/profiler/common/profiler_parameters.py +210 -0
  309. mindspore/profiler/common/profiler_path_manager.py +120 -0
  310. mindspore/profiler/common/record_function.py +76 -0
  311. mindspore/profiler/common/tlv_decoder.py +76 -0
  312. mindspore/profiler/common/util.py +75 -2
  313. mindspore/profiler/dynamic_profiler.py +270 -37
  314. mindspore/profiler/envprofiler.py +138 -0
  315. mindspore/profiler/mstx.py +199 -0
  316. mindspore/profiler/platform/__init__.py +21 -0
  317. mindspore/profiler/platform/base_profiler.py +40 -0
  318. mindspore/profiler/platform/cpu_profiler.py +124 -0
  319. mindspore/profiler/platform/gpu_profiler.py +74 -0
  320. mindspore/profiler/platform/npu_profiler.py +309 -0
  321. mindspore/profiler/profiler.py +580 -93
  322. mindspore/profiler/profiler_action_controller.py +187 -0
  323. mindspore/profiler/profiler_interface.py +114 -0
  324. mindspore/profiler/schedule.py +208 -0
  325. mindspore/rewrite/api/symbol_tree.py +1 -2
  326. mindspore/run_check/_check_version.py +2 -6
  327. mindspore/runtime/__init__.py +37 -0
  328. mindspore/runtime/device.py +27 -0
  329. mindspore/runtime/event.py +209 -0
  330. mindspore/runtime/executor.py +148 -0
  331. mindspore/runtime/memory.py +392 -0
  332. mindspore/runtime/stream.py +460 -0
  333. mindspore/runtime/thread_bind_core.py +401 -0
  334. mindspore/swresample-4.dll +0 -0
  335. mindspore/swscale-6.dll +0 -0
  336. mindspore/tinyxml2.dll +0 -0
  337. mindspore/train/__init__.py +2 -2
  338. mindspore/train/_utils.py +53 -18
  339. mindspore/train/amp.py +8 -4
  340. mindspore/train/callback/_checkpoint.py +32 -18
  341. mindspore/train/callback/_early_stop.py +1 -1
  342. mindspore/train/callback/_flops_collector.py +105 -69
  343. mindspore/train/callback/_history.py +1 -1
  344. mindspore/train/callback/_summary_collector.py +44 -6
  345. mindspore/train/callback/_tft_register.py +31 -10
  346. mindspore/train/dataset_helper.py +11 -11
  347. mindspore/train/metrics/precision.py +4 -5
  348. mindspore/train/mind_ir_pb2.py +167 -46
  349. mindspore/train/model.py +13 -15
  350. mindspore/train/serialization.py +462 -76
  351. mindspore/train/summary/summary_record.py +1 -2
  352. mindspore/train/train_thor/model_thor.py +1 -1
  353. mindspore/turbojpeg.dll +0 -0
  354. mindspore/utils/__init__.py +4 -2
  355. mindspore/utils/dryrun.py +138 -0
  356. mindspore/utils/runtime_execution_order_check.py +550 -0
  357. mindspore/version.py +1 -1
  358. {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/METADATA +2 -3
  359. {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/RECORD +362 -238
  360. {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
  361. mindspore/common/_tensor_overload.py +0 -139
  362. mindspore/mindspore_np_dtype.dll +0 -0
  363. mindspore/profiler/envprofiling.py +0 -254
  364. mindspore/profiler/profiling.py +0 -1926
  365. {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
  366. {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0

mindspore/dataset/engine/datasets_standard_format.py

@@ -34,7 +34,7 @@ from .datasets_user_defined import GeneratorDataset
  from .obs.obs_mindrecord_dataset import MindRecordFromOBS
  from .validators import check_csvdataset, check_minddataset, check_tfrecorddataset, check_obsminddataset
  from ..core.validator_helpers import type_check
- from ...mindrecord.config import _get_enc_key, _get_dec_mode, _get_hash_mode, decrypt, verify_file_hash
+ from ...mindrecord.config import _get_enc_key, _get_dec_mode, decrypt


  from ..core.validator_helpers import replace_none
@@ -75,6 +75,8 @@ class CSVDataset(SourceDataset, UnionBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``. This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
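
The `num_shards` / `shard_id` pair documented above is how a dataset pipeline is split across data-parallel workers. A minimal sketch of the pattern the new doc link describes, assuming a distributed job has already been launched (e.g. with msrun) and a hypothetical file ./data/train.csv:

    import mindspore.dataset as ds
    from mindspore.communication import init, get_rank, get_group_size

    init()                         # initialize the communication backend for this process
    rank_id = get_rank()           # index of this worker
    rank_size = get_group_size()   # total number of data-parallel workers

    # Each worker loads only its own 1/rank_size shard of the data.
    dataset = ds.CSVDataset("./data/train.csv",
                            num_shards=rank_size,
                            shard_id=rank_id,
                            shuffle=True)
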
@@ -143,6 +145,8 @@ class MindDataset(MappableDataset, UnionBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  sampler (Sampler, optional): Object used to choose samples from the
@@ -255,19 +259,15 @@ class MindDataset(MappableDataset, UnionBaseDataset):

  # do decrypt & integrity check
  if not isinstance(self.dataset_files, list):
- if _get_enc_key() is not None or _get_hash_mode() is not None:
+ if _get_enc_key() is not None:
  logger.warning("When a single mindrecord file which is generated by " +
  "`mindspore.mindrecord.FileWriter` with `shard_num` > 1 is used as the input, " +
- "enabling decryption/integrity check may fail. Please use file list as the input.")
+ "enabling decryption check may fail. Please use file list as the input.")

  # decrypt the data file and index file
  index_file_name = self.dataset_files + ".db"
  self.dataset_files = decrypt(self.dataset_files, _get_enc_key(), _get_dec_mode())
  decrypt(index_file_name, _get_enc_key(), _get_dec_mode())
-
- # verify integrity check
- verify_file_hash(self.dataset_files)
- verify_file_hash(self.dataset_files + ".db")
  else:
  file_tuple = []
  for item in self.dataset_files:
@@ -276,10 +276,6 @@ class MindDataset(MappableDataset, UnionBaseDataset):
  decrypt_filename = decrypt(item, _get_enc_key(), _get_dec_mode())
  file_tuple.append(decrypt_filename)
  decrypt(index_file_name, _get_enc_key(), _get_dec_mode())
-
- # verify integrity check
- verify_file_hash(decrypt_filename)
- verify_file_hash(decrypt_filename + ".db")
  self.dataset_files = file_tuple

  self.columns_list = replace_none(columns_list, [])
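
The hunks above drop the MindRecord integrity (hash) check while keeping the decryption path. For context, reading an encrypted MindRecord file still requires the key and cipher mode to be configured before MindDataset is built. A rough sketch, assuming the public setters set_enc_key / set_dec_mode sit behind the internal _get_enc_key / _get_dec_mode getters used here (verify the exact names and key format against the 2.5.0 mindrecord API), and a hypothetical file ./data/train.mindrecord:

    import mindspore.dataset as ds
    from mindspore.mindrecord import set_enc_key, set_dec_mode  # assumed public setters

    # Configure the same key/mode that FileWriter used when the file was written.
    set_enc_key("0123456789012345")   # assumed 16-character key
    set_dec_mode("AES-GCM")           # assumed cipher mode

    # Pass a file list; per the warning above, a single multi-shard file may fail to decrypt.
    dataset = ds.MindDataset(dataset_files=["./data/train.mindrecord"])
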
@@ -361,6 +357,8 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
  num_shards (int, optional): Number of shards that the dataset will be divided
  into. Default: ``None`` . When this argument is specified, `num_samples` reflects
  the maximum sample number per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  shard_equal_rows (bool, optional): Get equal rows for all shards. Default: ``False``. If `shard_equal_rows`
@@ -476,6 +474,8 @@ class OBSMindDataset(GeneratorDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided
  into. Default: ``None`` .
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within num_shards. Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  shard_equal_rows (bool, optional): Get equal rows for all shards. Default: ``True``. If shard_equal_rows

mindspore/dataset/engine/datasets_text.py

@@ -67,6 +67,8 @@ class AGNewsDataset(SourceDataset, TextBaseDataset):
  num_shards (int, optional): Number of shards that the dataset will be divided into.
  Default: ``None``. When this argument is specified, `num_samples` reflects the
  max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . This
  argument can only be specified when `num_shards` is also specified. Default: ``None``.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -175,6 +177,8 @@ class AmazonReviewDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -273,6 +277,8 @@ class CLUEDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -515,7 +521,8 @@ class CoNLL2000Dataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into.
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
- Default: ``None`` .
+ Default: ``None`` . Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . This
  argument can only be specified when `num_shards` is also specified. Default: ``None`` .
  num_parallel_workers (int, optional): Number of worker threads to read the data.
@@ -618,6 +625,8 @@ class DBpediaDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -717,6 +726,8 @@ class EnWik9Dataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -805,6 +816,8 @@ class IMDBDataset(MappableDataset, TextBaseDataset):
  num_shards (int, optional): Number of shards that the dataset will be divided
  into. Default: ``None`` . When this argument is specified, `num_samples` reflects
  the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -941,6 +954,8 @@ class IWSLT2016Dataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  num_parallel_workers (int, optional): Number of worker threads to read the data.
@@ -1073,6 +1088,8 @@ class IWSLT2017Dataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  num_parallel_workers (int, optional): Number of worker threads to read the data.
@@ -1181,6 +1198,8 @@ class Multi30kDataset(SourceDataset, TextBaseDataset):
  num_shards (int, optional): Number of shards that the dataset will be divided
  into. Default: ``None`` . When this argument is specified, `num_samples` reflects
  the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1290,6 +1309,8 @@ class PennTreebankDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1389,6 +1410,8 @@ class SogouNewsDataset(SourceDataset, TextBaseDataset):
  - ``Shuffle.FILES`` : Shuffle files only.
  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  num_parallel_workers (int, optional): Number of worker threads to read the data.
@@ -1490,6 +1513,8 @@ class SQuADDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1608,6 +1633,8 @@ class SST2Dataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards`. This argument can only be specified when
  `num_shards` is also specified. Default: ``None`` .
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1711,6 +1738,8 @@ class TextFileDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1775,6 +1804,8 @@ class UDPOSDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  num_parallel_workers (int, optional): Number of worker threads to read the data.
@@ -1861,6 +1892,8 @@ class WikiTextDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -1958,6 +1991,8 @@ class YahooAnswersDataset(SourceDataset, TextBaseDataset):

  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the maximum sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
@@ -2058,6 +2093,8 @@ class YelpReviewDataset(SourceDataset, TextBaseDataset):
  - ``Shuffle.FILES`` : Shuffle files only.
  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  num_parallel_workers (int, optional): Number of worker threads to read the data.

mindspore/dataset/engine/datasets_user_defined.py

@@ -28,12 +28,10 @@ import signal
  import time
  from types import GeneratorType
  import multiprocessing
- from multiprocessing.util import Finalize
  import queue
  from functools import partial
  import subprocess
  import threading
- import weakref
  import platform
  import psutil
  import numpy as np
@@ -46,7 +44,7 @@ from mindspore import log as logger
  from .datasets import UnionBaseDataset, MappableDataset, Schema, to_list, _PythonMultiprocessing, _check_shm_usage
  from . import samplers
  from .queue import _SharedQueue
- from .validators import check_generatordataset, check_numpyslicesdataset, check_paddeddataset
+ from .validators import check_generator_dataset, check_numpy_slices_dataset, check_padded_dataset
  from ..core.config import get_enable_shared_mem, get_prefetch_size, get_multiprocessing_timeout_interval, \
  get_enable_watchdog, get_debug_mode, get_seed, set_seed
  from ..core.datatypes import mstypelist_to_detypelist
@@ -221,7 +219,6 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  self.ppid = os.getpid()
  self.pids = []
  self.check_interval = get_multiprocessing_timeout_interval() # the interval of check queue's size
- self._final_join = True

  # Event for end of epoch
  if self.multi_process is True:
@@ -272,8 +269,14 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  worker.daemon = True
  self.need_join = True
  self.workers.append(worker)
- if self.multi_process and platform.system().lower() != 'windows':
- self._launch_cleanup_worker()
+
+ if self.multi_process:
+ logger.info("Launch generator worker process(es): {}".format([worker.pid for worker in self.workers]))
+ if platform.system().lower() != 'windows':
+ self._launch_monitor()
+
+ def terminate(self):
+ self._stop_subprocess()

  def _interval_log(self, i, start_time, wait_count):
  cost_time = int(time.time()) - start_time
@@ -394,9 +397,11 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  "the `mindspore.dataset.config.set_multiprocessing_timeout_interval` interface."
  logger.warning(warning_message)

- def _launch_cleanup_worker(self):
+ def _launch_monitor(self):
  """
- We need a extra thread and process if main process or subprocess was killed.
+ Launch a clean process and register subprocess to be monitored by the watch dog.
+ The clean process will clean up subprocesses when main process exited.
+ The watch dog will clean up subprocesses and main process when any subprocess exited.
  """
  _clean_worker_func = _PythonMultiprocessing._clean_process # pylint: disable=W0212
  self.cleaning_process = multiprocessing.Process(target=_clean_worker_func,
@@ -404,21 +409,13 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  args=(self.ppid, self.workers, self.eof))
  self.cleaning_process.daemon = True
  self.cleaning_process.start()
+ logger.info("Launch clean process {} to monitor worker "
+ "process(es): {}".format(self.cleaning_process.pid, [worker.pid for worker in self.workers]))

  if get_enable_watchdog():
- self.eot = threading.Event()
- self.watch_dog = threading.Thread(target=_PythonMultiprocessing._watch_dog, # pylint: disable=W0212
- name="GeneratorWatchDog",
- args=(self.eot, self.workers + [self.cleaning_process]))
- self.watch_dog.daemon = True
- self.watch_dog.start()
-
- if self._final_join is True:
- self._jointhread = Finalize(
- self.watch_dog, self._finalize_join,
- args=(weakref.ref(self.watch_dog), self.eot),
- exitpriority=-5
- )
+ worker_ids = [worker.pid for worker in self.workers]
+ worker_ids.append(self.cleaning_process.pid)
+ cde.register_worker_pids(id(self), set(worker_ids))

  def _release_fd(self):
  """Release the file descriptor by subprocess"""
@@ -454,15 +451,8 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  def _stop_subprocess(self):
  """Only the main process can call join. All the sub-process / sub-thread will be stopped."""
  if self.need_join is True and self.ppid == os.getpid():
- # the sub-process / sub-thread will stop by self.eof.set()
- if hasattr(self, 'eof') and self.eof is not None:
- try:
- self.eof.set()
- except AttributeError: # maybe occur "'NoneType' object has no attribute 'maxsize'"
- pass
-
- # close the watch dog first
- self._abort_watchdog()
+ # abort the monitor first
+ self._abort_monitor()
  self.need_join = False

  # waiting for the sub-process stop
@@ -489,10 +479,12 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  self.workers.clear()
  self.workers = None

- def _abort_watchdog(self):
- """Let watchdog quit."""
- if hasattr(self, 'eot') and self.eot is not None and not self.eot.is_set():
- self.eot.set()
+ def _abort_monitor(self):
+ """Deregister workers monitored by the watch dog and join clean process."""
+ if get_enable_watchdog():
+ cde.deregister_worker_pids(id(self))
+ if hasattr(self, 'eof') and self.eof is not None:
+ self.eof.set()
  if hasattr(self, 'cleaning_process') and self.cleaning_process is not None:
  # let the quit event notify the cleaning process to exit
  self.cleaning_process.join(timeout=5)
@@ -503,14 +495,6 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
  if hasattr(self, 'count'):
  del self.count

- @classmethod
- def _finalize_join(cls, twr, eot):
- thread = twr()
- if thread is not None:
- if eot is not None and not eot.is_set():
- eot.set()
- thread.join()
-
  def __del__(self):
  try:
  self._stop_subprocess()
@@ -554,7 +538,7 @@ def _generator_worker_loop(dataset, idx_queue, result_queue, eof, is_multiproces
  cde.register_worker_handlers()

  if is_multiprocessing:
- result_queue.cancel_join_thread() # Ensure that the process does not hung when exiting
+ result_queue.cancel_join_thread() # Ensure that the process does not hang when exiting
  signal.signal(signal.SIGTERM, partial(_subprocess_handle, eof))

  # init the random seed and np.random seed for the subprocess
@@ -694,6 +678,7 @@ class _GeneratorWorkerMp(multiprocessing.Process):

  class _GeneratorWrapper:
  """Wrapper the generator so that it can be iterated multiple times in GeneratorDataset."""
+
  def __init__(self, generator):
  self.generator = generator
  self.generator_new, self.generator = itertools.tee(self.generator)
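
The _GeneratorWrapper shown above leans on itertools.tee to make a one-shot generator re-iterable across epochs. A standalone sketch of the same idea (plain Python, not MindSpore code):

    import itertools

    class ReiterableGenerator:
        """Make a single-use generator reusable by splitting it with itertools.tee."""

        def __init__(self, generator):
            self.generator = generator

        def __iter__(self):
            # Hand out one copy of the remaining stream and keep the other for later passes.
            fresh, self.generator = itertools.tee(self.generator)
            return fresh

    data = ReiterableGenerator(x * x for x in range(3))
    print(list(data))  # [0, 1, 4]
    print(list(data))  # [0, 1, 4] again; the wrapper was not exhausted
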
@@ -713,13 +698,22 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  The column names and column types of generated dataset depend on Python data defined by users.

  Args:
- source (Union[Callable, Iterable, Random Accessible]):
- A generator callable object, an iterable Python object or a random accessible Python object.
- Callable source is required to return a tuple of NumPy arrays as a row of the dataset on source().next().
- Iterable source is required to return a tuple of NumPy arrays as a row of the dataset on
- iter(source).next().
- Random accessible source is required to return a tuple of NumPy arrays as a row of the dataset on
- source[idx].
+ source (Union[Random Accessible, Iterable]): A custom dataset from which to load the data.
+ MindSpore supports the following types of datasets:
+
+ - Random-accessible (map-style) datasets: A dataset object that implements the `__getitem__()`
+ and `__len__()` methods, represents a mapping from indexes/keys to data samples.
+ For example, such a dataset `source`, when accessed with `source[idx]`, can read the idx-th sample
+ from disk, see `Random-accessible dataset example <https://www.mindspore.cn/tutorials/en/master/
+ beginner/dataset.html#random-accessible-dataset>`_ for details.
+
+ - Iterable-style dataset: An iterable dataset object that implements `__iter__()` and `__next__()` methods,
+ represents an iterable over data samples. This type of dataset is suitable for situations where
+ random reads are costly or even impossible, and where batch sizes depend on the data being acquired.
+ For example, such a dataset `source`, when accessed `iter(source)`, can return a stream of data reading
+ from a database or remote server, see `Iterable-style dataset example
+ <https://www.mindspore.cn/tutorials/en/master/beginner/dataset.html#iterable-dataset>`_ for details.
+
  column_names (Union[str, list[str]], optional): List of column names of the dataset. Default: ``None`` .
  Users are required to provide either column_names or schema.
  column_types (list[mindspore.dtype], optional): List of column data types of the dataset. Default: ``None`` .
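
The rewritten `source` docstring above distinguishes the two supported dataset styles. A minimal sketch of both, with made-up in-memory data, fed to GeneratorDataset:

    import numpy as np
    import mindspore.dataset as ds

    class RandomAccessSource:
        """Map-style source: implements __getitem__ and __len__."""
        def __init__(self):
            self._data = np.arange(10, dtype=np.float32).reshape(5, 2)
        def __getitem__(self, index):
            return (self._data[index],)
        def __len__(self):
            return len(self._data)

    class IterableSource:
        """Iterable-style source: implements __iter__ and __next__."""
        def __init__(self):
            self._data = np.arange(10, dtype=np.float32).reshape(5, 2)
            self._index = 0
        def __iter__(self):
            self._index = 0
            return self
        def __next__(self):
            if self._index >= len(self._data):
                raise StopIteration
            item = (self._data[self._index],)
            self._index += 1
            return item

    map_style = ds.GeneratorDataset(RandomAccessSource(), column_names=["col"])
    iter_style = ds.GeneratorDataset(IterableSource(), column_names=["col"], shuffle=False)
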
@@ -737,7 +731,8 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  input is required. Default: ``None`` , expected order behavior shown in the table below.
  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  Random accessible input is required. When this argument is specified, `num_samples` reflects the maximum
- sample number of per shard.
+ sample number of per shard. Used in `data parallel training <https://www.mindspore.cn/docs/en/master/
+ model_train/parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
  This argument must be specified only when `num_shards` is also specified.
  Random accessible input is required.
@@ -748,6 +743,11 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  ``num_parallel_workers`` and :func:`mindspore.dataset.config.set_prefetch_size` increase. If set to -1,
  shared memory will be dynamically allocated with the actual size of data. This is only used if
  ``python_multiprocessing`` is set to True. Default: ``None`` , allocate shared memory dynamically.
+ batch_sampler (Iterable, optional): Similar to `sampler` , but returns a batch of indices at a time, the
+ corresponding data will be combined into a batch. Mutually exclusive with `num_samples` , `shuffle` ,
+ `num_shards` , `shard_id` and `sampler` . Default: ``None`` , do not use batch sampler.
+ collate_fn (Callable[List[numpy.ndarray]], optional): Define how to merge a list of data into a batch.
+ Only valid if `batch_sampler` is used. Default: ``None`` , do not use collation function.

  Raises:
  RuntimeError: If source raises an exception during execution.
@@ -758,6 +758,11 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  ValueError: If `num_shards` is specified but shard_id is None.
  ValueError: If shard_id is specified but `num_shards` is None.
  ValueError: If `shard_id` is not in range of [0, `num_shards` ).
+ TypeError: If `batch_sampler` is not iterable.
+ ValueError: If `batch_sampler` is specified together with `num_samples` ,
+ `shuffle` , `num_shards` , `shard_id` and `sampler`.
+ TypeError: If `collate_fn` is not callable.
+ ValueError: If `collate_fn` is specified while `batch_sampler` is None.

  Tutorial Examples:
  - `Load & Process Data With Dataset Pipeline
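
`batch_sampler` and `collate_fn` are new parameters in this release (the __init__ signature change follows below). A hedged sketch of how they compose, based only on the documented semantics above: the batch sampler yields one list of indices per batch and `collate_fn` merges the fetched rows; the exact argument and return conventions of `collate_fn` are assumptions to check against the 2.5.0 documentation.

    import numpy as np
    import mindspore.dataset as ds

    class Source:
        """Map-style source of 10 rows with one column."""
        def __init__(self):
            self._data = np.arange(20, dtype=np.float32).reshape(10, 2)
        def __getitem__(self, index):
            return (self._data[index],)
        def __len__(self):
            return len(self._data)

    # A plain iterable of index lists; each inner list becomes one batch.
    batches = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

    def stack_batch(samples):
        # Assumed convention: merge the list of per-sample arrays into one batch array.
        return np.stack(samples)

    dataset = ds.GeneratorDataset(Source(), column_names=["col"],
                                  batch_sampler=batches, collate_fn=stack_batch)
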
@@ -851,10 +856,10 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  >>> dataset = ds.GeneratorDataset(source=[(np.array(0),), (np.array(1),), (np.array(2),)], column_names=["col"])
  """

- @check_generatordataset
+ @check_generator_dataset
  def __init__(self, source, column_names=None, column_types=None, schema=None, num_samples=None,
  num_parallel_workers=1, shuffle=None, sampler=None, num_shards=None, shard_id=None,
- python_multiprocessing=True, max_rowsize=None):
+ python_multiprocessing=True, max_rowsize=None, batch_sampler=None, collate_fn=None):
  super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
  shuffle=shuffle, num_shards=num_shards, shard_id=shard_id)
  if isinstance(source, builtins.zip):
@@ -895,18 +900,41 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  self.schema = schema
  if not isinstance(schema, Schema):
  self.schema = Schema(schema)
+
+ self.has_batch_sampler = False
+ if batch_sampler is not None:
+ self.has_batch_sampler = True
+ if not isinstance(batch_sampler, samplers.BuiltinSampler):
+ self.sampler = samplers.IterSampler(batch_sampler)
+ else:
+ self.sampler = batch_sampler
+
  # Move get dataset_size by len from parse to here, because self.source will
  # lose attribution of '__len__' after deepcopy.
+ self._calculate_source_length()
+
+ self.max_rowsize = max_rowsize if max_rowsize is not None else -1
+ self.sample_fn = None
+ # Ignore batch_info in the input parameter.
+ self.collate_fn = (lambda *args: collate_fn(*args[:-1])) if collate_fn is not None else None
+
+ def _calculate_source_length(self):
+ """Calculate the source length according to the source and sampler."""
  self.source_len = -1 # unknown
  if hasattr(self.source, "__len__"):
  self.source_len = len(self.source)

  # if user defined sampler, update the self.source_len
  if isinstance(self.sampler, samplers.Sampler) or hasattr(self.sampler, "__iter__"):
- self.source_len = len(list(sampler))
-
- self.max_rowsize = max_rowsize if max_rowsize is not None else -1
- self.sample_fn = None
+ if self.sampler.child_sampler is not None:
+ raise RuntimeError("GeneratorDataset does not support user defined sampler with child sampler yet.")
+ if self.sampler.num_samples is not None:
+ self.source_len = self.sampler.num_samples
+ elif hasattr(self.sampler, "__len__"):
+ self.source_len = len(self.sampler)
+ else:
+ # counting on a copied sampler to prevent changing the random state of the original one
+ self.source_len = len(list(copy.deepcopy(self.sampler)))

  def __deepcopy__(self, memodict):
  if id(self) in memodict:
@@ -917,18 +945,20 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  type_check(index, (int, np.number), "index")
  if not hasattr(self.source, "__getitem__"):
  raise RuntimeError("Dataset don't support randomized access.")
+ if self.has_batch_sampler:
+ raise RuntimeError("GeneratorDataset with batch_sampler does not support random access.")
  if not hasattr(self, "generator_op"):
  dataset = copy.deepcopy(self)
  self.prepared_source = _generator_fn_wrapper(_cpp_sampler_fn, self.source)
  if self.schema is None:
  dataset.generator_node = cde.GeneratorNode(self.prepared_source, self.column_names, self.column_types,
- self.source_len, self.sampler, 1, None)
+ self.source_len, self.sampler, 1, None, False)
  else:
  schema = self.schema
  if isinstance(schema, Schema):
  schema = self.schema.cpp_schema
  dataset.generator_node = cde.GeneratorNode(self.prepared_source, schema, self.source_len,
- self.sampler, 1, None)
+ self.sampler, 1, None, False)
  self.generator_op = dataset.generator_node.Build()
  sample_id = self.generator_op.GetMappedIndex(index)
  return self.source[sample_id]
@@ -945,9 +975,11 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):

  def split(self, sizes, randomize=True):
  if hasattr(self.source, "__getitem__"):
- # If the source has __getitem__ attribute, call the split method of MappableDataset.
- # Otherwise, call the split method of Dataset.
- return super().split(sizes, randomize)
+ if not self.has_batch_sampler:
+ # If the source has __getitem__ attribute, call the split method of MappableDataset.
+ # Otherwise, call the split method of Dataset.
+ return super().split(sizes, randomize)
+ logger.warning("The performance of split will be degraded since batch_sampler is detected.")
  return super(MappableDataset, self).split(sizes, randomize)

  def prepare_multiprocessing(self):
@@ -984,12 +1016,12 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
  self.prepare_multiprocessing()
  if self.schema is None:
  return cde.GeneratorNode(self.prepared_source, self.column_names, self.column_types, self.source_len,
- self.sampler, self.num_parallel_workers, self.sample_fn)
+ self.sampler, self.num_parallel_workers, self.sample_fn, self.has_batch_sampler)
  schema = self.schema
  if isinstance(schema, Schema):
  schema = self.schema.cpp_schema
  return cde.GeneratorNode(self.prepared_source, schema, self.source_len, self.sampler,
- self.num_parallel_workers, self.sample_fn)
+ self.num_parallel_workers, self.sample_fn, self.has_batch_sampler)

  def __validate_memory_usage(self):
  """
@@ -1107,6 +1139,8 @@ class NumpySlicesDataset(GeneratorDataset):
  Default: ``None`` , expected order behavior shown in the table below.
  num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
  When this argument is specified, `num_samples` reflects the max sample number of per shard.
+ Used in `data parallel training <https://www.mindspore.cn/docs/en/master/model_train/
+ parallel/data_parallel.html#data-parallel-mode-loads-datasets>`_ .
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument must be
  specified only when `num_shards` is also specified.

@@ -1149,7 +1183,7 @@ class NumpySlicesDataset(GeneratorDataset):
  >>> dataset = ds.NumpySlicesDataset(data=dict(df), shuffle=False)
  """

- @check_numpyslicesdataset
+ @check_numpy_slices_dataset
  def __init__(self, data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None, sampler=None,
  num_shards=None, shard_id=None):
  dataset = _NumpySlicesDataset(data, column_names)
@@ -1202,7 +1236,7 @@ class PaddedDataset(GeneratorDataset):
  >>> dataset = ds.PaddedDataset(padded_samples=data)
  """

- @check_paddeddataset
+ @check_padded_dataset
  def __init__(self, padded_samples):
  dataset = _PaddedDataset(padded_samples)
  super().__init__(dataset, column_names=dataset.column_names, num_shards=None, shard_id=None, shuffle=False)