mindspore-2.3.0-cp310-cp310-win_amd64.whl → mindspore-2.4.0-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +46 -13
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +209 -29
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +310 -55
- mindspore/communication/management.py +14 -14
- mindspore/context.py +123 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +495 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +266 -21
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +28 -7
- mindspore/mint/special/__init__.py +63 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +275 -93
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +113 -3
- mindspore/nn/layer/embedding.py +120 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +127 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +734 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
- mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +490 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +558 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +184 -8
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +6 -1
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +12 -146
- mindspore/ops/operations/comm_ops.py +42 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +265 -10
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +28 -8
- mindspore/parallel/_cell_wrapper.py +83 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +81 -11
- mindspore/parallel/_utils.py +13 -1
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +993 -0
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +280 -412
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +36 -103
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +28 -2
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +85 -22
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +134 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +352 -0
- mindspore/train/dataset_helper.py +7 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +134 -58
- mindspore/train/serialization.py +336 -112
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +281 -275
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
mindspore/dataset/engine/datasets.py

@@ -1,4 +1,4 @@
-# Copyright 2022-
+# Copyright 2022-2024 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,13 +16,13 @@
 1. This file is an abstraction of the dataset loading class. It contains
    some basic dataset operations(skip, filter, map, batch, ...).
 2. Specific dataset loading classes can be found in datasets_vision.py, datasets_text.py,
-   datasets_audio.py, datasets_standard_format.py and
+   datasets_audio.py, datasets_standard_format.py and datasets_user_defined.py files.
    datasets_vision.py: contains vision dataset loading classes.
    datasets_text.py: contains text dataset loading classes.
    datasets_audio.py: contains audio dataset loading classes.
    datasets_standard_format.py: contains standard format loading classes which
                                 any other kinds of datasets can be converted to.
-
+   datasets_user_defined.py: contains basic classes that help users to define
                              flexible ways to load dataset.
 """
 import atexit
@@ -66,13 +66,13 @@ from mindspore.dataset.debug import DebugHook
 
 from mindspore.dataset.engine import samplers
 from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
-    ITERATORS_LIST, _unset_iterator_cleanup
+    ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
 from .queue import _SharedQueue, _Queue
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
     check_rename, check_device_send, check_take, check_output_shape, check_project, \
     check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
     check_save, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, check_padded_batch, \
-    check_total_batch
+    check_total_batch, check_sync_update
 from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
     get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, _get_debug_hook_list
 from ..core.datatypes import mstype_to_detype
@@ -494,6 +494,12 @@ class Dataset:
 
         .. image:: bucket_batch_by_length_en.png
 
+        Note:
+            - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+              sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+              You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+              or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.
+
         Args:
             column_names (list[str]): Columns passed to element_length_function.
             bucket_boundaries (list[int]): A list consisting of the upper boundaries
@@ -564,8 +570,12 @@ class Dataset:
         .. image:: batch_en.png
 
         Note:
-            The order of using repeat and batch reflects the number of batches and per_batch_map.
-
+            - The order of using repeat and batch reflects the number of batches and per_batch_map.
+              It is recommended that the repeat operation applied after the batch operation finished.
+            - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+              sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+              You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+              or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.
 
         Args:
             batch_size (Union[int, Callable]): The number of rows each batch is created with. An
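As a quick illustration of the new note, a minimal sketch (data and sizes are illustrative): with 10 samples and batch_size=3, dropping the ragged final batch keeps every step's input shape fixed for data sinking.

import numpy as np
import mindspore.dataset as ds

data = ds.NumpySlicesDataset(np.arange(10).astype(np.float32),
                             column_names=["data"], shuffle=False)
data = data.batch(batch_size=3, drop_remainder=True)  # discard the ragged batch of 1
for batch in data.create_tuple_iterator(output_numpy=True):
    print(batch[0].shape)  # always (3,), so the network input shape stays consistent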
@@ -598,10 +608,10 @@ class Dataset:
                   name as the input columns, i.e., the columns will be replaced.
 
             - python_multiprocessing (bool, optional): Parallelize Python function `per_batch_map` with
-
-              ``False`` means
-
-
+              multiprocessing or multithreading mode, ``True`` means multiprocessing,
+              ``False`` means multithreading If `per_batch_map` is a I/O bound task, use
+              multithreading mode. If `per_batch_map` is a CPU bound task, it is recommended to use
+              multiprocessing mode. Default: ``False`` , use python multithreading mode.
 
             - max_rowsize(Union[int, list[int]], optional): Maximum size of row in MB that is used for shared memory
               allocation to copy data between processes, the total occupied shared memory will increase as
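To illustrate the I/O-bound versus CPU-bound guidance above, a hedged sketch (the transform is made up): a CPU-heavy `per_batch_map` benefits from `python_multiprocessing=True`, while an I/O-bound one can keep the default threading.

import numpy as np
import mindspore.dataset as ds

def square_batch(col, batch_info):
    # `col` is a list with one ndarray per row of the batch; this is CPU-bound work.
    return ([np.square(x) for x in col],)

data = ds.NumpySlicesDataset(np.arange(12).astype(np.float32),
                             column_names=["data"], shuffle=False)
data = data.batch(batch_size=4, per_batch_map=square_batch, input_columns=["data"],
                  python_multiprocessing=True, num_parallel_workers=2)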
@@ -611,7 +621,7 @@ class Dataset:
              ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
              If it is a list, the first element represents the ``input_columns`` use this value as the unit to
              create shared memory, and the second element represents ``output_columns`` use this value as the unit
-              to create shared memory. Default:
+              to create shared memory. Default: ``None`` , allocate shared memory dynamically.
 
         Returns:
             Dataset, a new dataset with the above operation applied.
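A sketch of the three accepted `max_rowsize` forms under the 2.4.0 semantics documented above (`data` and `fn` are assumed to exist):

batched = data.batch(
    8,
    per_batch_map=fn,
    python_multiprocessing=True,
    max_rowsize=None,        # new default: allocate shared memory dynamically
    # max_rowsize=16,        # int: 16 MB per row for inputs and outputs alike
    # max_rowsize=[16, 32],  # list: 16 MB for input_columns, 32 MB for output_columns
)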
@@ -657,8 +667,12 @@ class Dataset:
         .. image:: padded_batch_en.png
 
         Note:
-            The order of using repeat and padded_batch reflects the number of batches.
-
+            - The order of using repeat and padded_batch reflects the number of batches.
+              It is recommended that the repeat operation applied after the padded_batch operation finished.
+            - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+              sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+              You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+              or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.
 
         Args:
             batch_size (Union[int, Callable]): The number of rows each batch is created with. An
@@ -905,7 +919,7 @@ class Dataset:
              ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
              If it is a list, the first element represents the ``input_columns`` use this value as the unit to
              create shared memory, and the second element represents ``output_columns`` use this value as the unit
-              to create shared memory. Default:
+              to create shared memory. Default: ``None`` , allocate shared memory dynamically.
 
             - cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
               Default: ``None``, which means no cache is used.
@@ -989,8 +1003,8 @@ class Dataset:
             num_parallel_workers = 1
             logger.warning(
                 "Input 'operations' of 'map' includes network computing operators like in mindspore.nn, mindspore.ops, "
-                "mindspore.numpy module and etc, which do not support
-                "with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")
+                "mindspore.numpy module and etc, which do not support multithreading compiling, recommend to replace "
+                "it with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")
 
         return MapDataset(self, operations, input_columns, output_columns, num_parallel_workers, **kwargs)
 
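The warning above concerns transforms built from network operators; a NumPy-based transform keeps parallel map workers available. A minimal sketch (dataset and column names are illustrative):

import numpy as np
import mindspore.dataset as ds

def numpy_sin(x):
    # Pure NumPy transform: safe to run in multiple map workers.
    return np.sin(x).astype(np.float32)

data = ds.NumpySlicesDataset(np.linspace(0, 1, 100).astype(np.float32),
                             column_names=["data"], shuffle=False)
data = data.map(operations=numpy_sin, input_columns=["data"], num_parallel_workers=4)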
@@ -1523,8 +1537,8 @@ class Dataset:
            2. Before calling the function, do not use batch operation, repeat operation or data augmentation operations
               with random attribute in map operation.
            3. When array dimension is variable, one-dimensional arrays or
-
-           4. MindRecord does not support
+              multidimensional arrays with variable dimension 0 are supported.
+           4. MindRecord does not support multidimensional string or multidimensional bytes.
 
         Args:
             file_name (str): Path to dataset file.
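A hedged sketch of `Dataset.save` under those constraints (paths and shapes are illustrative):

import numpy as np
import mindspore.dataset as ds

def gen():
    # Rows vary only in dimension 0, which MindRecord save supports.
    yield (np.ones((2, 3), np.float32),)
    yield (np.ones((5, 3), np.float32),)

data = ds.GeneratorDataset(gen, column_names=["feat"], shuffle=False)
data.save("sample.mindrecord")  # no batch/repeat or random-augment map beforehand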
@@ -1741,6 +1755,7 @@ class Dataset:
         return self._col_names
 
     @check_output_shape
+    @_cleanup_the_iterators_if_created
     def output_shapes(self, estimate=False):
         """
         Get the shapes of output data.
@@ -1792,6 +1807,7 @@ class Dataset:
         self.saved_output_shapes = output_shapes
         return output_shapes
 
+    @_cleanup_the_iterators_if_created
     def output_types(self):
         """
         Get the types of output data.
@@ -1826,6 +1842,7 @@ class Dataset:
         del self.runtime_context
         return self.saved_output_types
 
+    @_cleanup_the_iterators_if_created
     def get_dataset_size(self):
         """
         Return the number of batches in an epoch.
@@ -1893,6 +1910,7 @@ class Dataset:
             return self.children[0].is_sync()
         return False
 
+    @check_sync_update
     def sync_update(self, condition_name, num_batch=None, data=None):
         """
         Release a blocking condition and trigger callback with given data.
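For context, the newly validated `sync_update` pairs with `sync_wait`; a hedged sketch of the pattern (condition name and callback are made up):

import numpy as np
import mindspore.dataset as ds

def update_cb(data):
    pass  # react to the data passed through sync_update

data = ds.NumpySlicesDataset(np.arange(8).astype(np.float32),
                             column_names=["x"], shuffle=False)
data = data.sync_wait(condition_name="policy", num_batch=1, callback=update_cb)
data = data.batch(2)
for batch in data.create_dict_iterator(output_numpy=True):
    # ... consume the batch, then release the next one:
    data.sync_update(condition_name="policy")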
@@ -2174,7 +2192,7 @@ class TextBaseDataset(Dataset):
                 Japanese or Chinese character sets, and 1.0 for other languages with small character sets
                 like English or Latin.
             model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
-                The input sentence must be
+                The input sentence must be pre-tokenized when using word type.
             params(dict): Any extra optional parameters of sentencepiece library according to your raw data
 
         Returns:
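A hedged sketch of building a SentencePiece vocab from a text dataset (file path and sizes are illustrative; with `SentencePieceModel.WORD` the corpus must already be tokenized into words):

import mindspore.dataset as ds
from mindspore.dataset.text import SentencePieceModel

corpus = ds.TextFileDataset("corpus.txt", shuffle=False)
vocab = corpus.build_sentencepiece_vocab(
    columns=["text"],
    vocab_size=5000,
    character_coverage=1.0,              # 1.0 suits small character sets such as English
    model_type=SentencePieceModel.WORD,  # requires pre-tokenized input sentences
    params={},
)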
@@ -2251,7 +2269,7 @@ class TextBaseDataset(Dataset):
                 Japanese or Chinese character sets, and 1.0 for other languages with small character sets
                 like English or Latin.
             model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
-                The input sentence must be
+                The input sentence must be pre-tokenized when using word type.
             params(dict): Any extra optional parameters of sentencepiece library according to your raw data
 
         Returns:
@@ -2629,12 +2647,12 @@ class BatchDataset(UnionBaseDataset):
            ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
            If it is a list, the first element represents the ``input_columns`` use this value as the unit to
            create shared memory, and the second element represents ``output_columns`` use this value as the unit
-            to create shared memory. Default:
+            to create shared memory. Default: ``None`` , allocate shared memory dynamically.
 
     """
 
     def __init__(self, input_dataset, batch_size, drop_remainder=False, num_parallel_workers=None, per_batch_map=None,
-                 input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=
+                 input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=None):
         super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers)
 
         if BatchDataset._is_ancestor_of_repeat(input_dataset):
@@ -2655,7 +2673,9 @@ class BatchDataset(UnionBaseDataset):
 
         self.python_multiprocessing = python_multiprocessing
         self.process_pool = None
-        if
+        if max_rowsize is None:
+            self.max_rowsize = [-1, -1]
+        elif isinstance(max_rowsize, int):
             self.max_rowsize = [max_rowsize * self.batch_size] * 2 if max_rowsize != -1 else [max_rowsize, max_rowsize]
         else:
             self.max_rowsize = [max_rowsize[0] * self.batch_size, max_rowsize[1] * self.batch_size]
@@ -3078,7 +3098,7 @@ class Pipe:
     Class to handle communication between the master process and the worker processes.
     """
 
-    def __init__(self, warning_ctl, shared_memory=False, max_rowsize=
+    def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
         self.shared_memory = shared_memory
         self.eof = multiprocessing.Event()
         if self.shared_memory:
@@ -3139,7 +3159,10 @@ def _worker_loop(operations, pipe, worker_id):
     """
     Multiprocess worker process loop.
     """
-    #
+    # Initialize C++ side signal handlers
+    cde.register_worker_handlers()
+
+    # Ensure that the process does not hang when exiting
     pipe.res_queue.cancel_join_thread()
 
     def _ignore_sigint():
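For background on the `cancel_join_thread()` call kept above: by default a process that has put data on a `multiprocessing.Queue` blocks at exit until the queue's feeder thread flushes, and cancelling the join removes that hang risk. A minimal stdlib sketch:

import multiprocessing as mp

def worker(q):
    q.cancel_join_thread()  # exiting will not wait for buffered items to flush
    q.put("result")

if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=worker, args=(q,))
    p.start()
    print(q.get())
    p.join()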
@@ -3153,6 +3176,7 @@ def _worker_loop(operations, pipe, worker_id):
     # that the random results of each process are different.
     if get_seed() != 5489:
         set_seed(get_seed() + worker_id)
+
     while not _main_process_already_exit():
         _ignore_sigint()
 
@@ -3184,7 +3208,7 @@ class _MPWorker(multiprocessing.Process):
     Worker process for multiprocessing.
     """
 
-    def __init__(self, operations, warning_ctl, max_rowsize=
+    def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
         shared_memory = get_enable_shared_mem()
         self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
         self.check_interval = get_multiprocessing_timeout_interval()
@@ -3216,14 +3240,6 @@ class _MPWorker(multiprocessing.Process):
                 logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
             try:
                 res = self.pipe.master_receive()
-                # Because there is no need to copy when creating Tensors in the C++layer, it reduces the time
-                # from np.ndarray to C++Tensor creation. However, when using shared memory in multiple processes,
-                # the address of the shared memory will always be passed to subsequent nodes in the dataset pipeline,
-                # and the shared memory will also be written by the current node, causing dirty data to be accessed
-                # by subsequent nodes in the pipeline. So make a memory copy here to solve the problem of
-                # shared memory being contaminated.
-                if get_enable_shared_mem():
-                    res = copy.deepcopy(res)
             except queue.Empty:
                 continue
             if res is None:
@@ -3286,7 +3302,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
             self.origin_hook(ex_type, value, tb)
             self.mp_pool_exit_preprocess()
 
-    def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=
+    def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=(-1, -1)):
         super(_PythonMultiprocessing, self).__init__()
         self.op_name = op_name
         self.num_parallel_workers = num_parallel_workers
@@ -3302,7 +3318,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
 
         self.eot = None
         self.watch_dog = None
-        self.ppid =
+        self.ppid = None
         self.hook = None
         self.warning_ctl = None
         # cache thread (get_ident()) to worker_id mapping in Python layer
@@ -3327,10 +3343,10 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                 if child_pid == 0:
                     break
             except OSError:
-                # waitpid may
+                # waitpid may fail for some reason, so we ignore this error
                 pass
 
-    # Dataset need watch_dog thread to monitoring fork
+    # Dataset need watch_dog thread to monitoring fork multiprocessing,
     # and thread can't be a member function otherwise python won't collect and release resources.
     @staticmethod
     def _watch_dog(eot, workers):
@@ -3363,6 +3379,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                            "main process will exit. If this is not an artificial operation, you can use "
                            "ds.config.set_enable_watchdog(False) to block this error.")
                 os.kill(os.getpid(), signal.SIGTERM)
+            # sleep to release GIL
+            time.sleep(1)
 
         # release the workers
         del workers
@@ -3451,6 +3469,12 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
         while _PythonMultiprocessing.is_process_alive(ppid):
             if quit_signal.is_set():
                 return
+
+            # independent dataset mode, the subprocess of GeneratorDataset / map / batch should exit when
+            # independent dataset process have exit
+            if os.getppid() != ppid:
+                break
+
             time.sleep(0.1)
 
         _PythonMultiprocessing._terminate_processes(workers)
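The new `os.getppid()` check detects re-parenting: when a process's parent dies, the child is adopted by another process, so its reported parent PID changes. A standalone sketch of that pattern:

import os
import time

def monitor_parent(expected_ppid, cleanup):
    # Poll until the recorded parent is gone; adoption changes os.getppid().
    while os.getppid() == expected_ppid:
        time.sleep(0.1)
    cleanup()  # parent exited: release workers, shared memory, etc.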
@@ -3462,10 +3486,10 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
         Launch Python multiprocessing pool.
 
         Args:
-
+            op_id: ID for operation to have Python multiprocessing pool launched
 
         Returns:
-            Python
+            Python multiprocessing pool is launched.
         """
         self.python_threads_to_workers = {}
         self.op_id = op_id
@@ -3476,6 +3500,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
             logger.warning(message)
             self.terminate()
             self.reset()
+        self.ppid = os.getpid()
         self.create_pool()
 
     def create_pool(self):
@@ -3677,12 +3702,13 @@ class MapDataset(UnionBaseDataset):
            ``python_multiprocessing`` is set to True. If it is an int value, it represents ``input_columns`` and
            ``output_columns`` use this value as the unit to create shared memory. If it is a list, the first element
            represents the ``input_columns`` use this value as the unit to create shared memory, and the second element
-            represents ``output_columns`` use this value as the unit to create shared memory. Default:
+            represents ``output_columns`` use this value as the unit to create shared memory. Default: ``None`` ,
+            allocate shared memory dynamically.
         offload (bool, optional): Flag to indicate whether offload is used. Default: ``None``.
     """
 
     def __init__(self, input_dataset, operations=None, input_columns=None, output_columns=None,
-                 num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=
+                 num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=None,
                  offload=None):
         super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers, cache=cache)
         self.operations = to_list(operations)
@@ -3708,7 +3734,9 @@ class MapDataset(UnionBaseDataset):
         self.process_pool = None
 
         self.callbacks = to_list(callbacks)
-        if
+        if max_rowsize is None:
+            self.max_rowsize = [-1, -1]
+        elif isinstance(max_rowsize, int):
             self.max_rowsize = [max_rowsize] * 2
         else:
             self.max_rowsize = max_rowsize
mindspore/dataset/engine/datasets_audio.py

@@ -63,7 +63,7 @@ class CMUArcticDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``, will use ``0``. This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None``, which means no cache is used.
 
     Raises:
@@ -180,7 +180,7 @@ class GTZANDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -298,7 +298,7 @@ class LibriTTSDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -425,7 +425,7 @@ class LJSpeechDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -548,7 +548,7 @@ class SpeechCommandsDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
             This argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -661,7 +661,7 @@ class TedliumDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -841,7 +841,7 @@ class YesNoDataset(MappableDataset, AudioBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument can only
             be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.
 
     Raises:
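These hunks all point the docstrings at the relocated cache guide. For reference, a hedged sketch of wiring a cache into one of these loaders (the session ID is a placeholder obtained from the external `cache_admin` tool, and the dataset path is illustrative):

import mindspore.dataset as ds

# A cache server must already be running (started via `cache_admin --start`).
some_cache = ds.DatasetCache(session_id=1456416665, size=0)
data = ds.GTZANDataset(dataset_dir="./gtzan", usage="all", cache=some_cache)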
mindspore/dataset/engine/datasets_standard_format.py

@@ -77,7 +77,7 @@ class CSVDataset(SourceDataset, UnionBaseDataset):
         shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``. This
             argument can only be specified when `num_shards` is also specified.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None``, which means no cache is used.
 
     Raises:
@@ -156,7 +156,7 @@ class MindDataset(MappableDataset, UnionBaseDataset):
         num_samples (int, optional): The number of samples to be included in the dataset.
             Default: ``None`` , all samples.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
 
     Raises:
@@ -166,6 +166,52 @@ class MindDataset(MappableDataset, UnionBaseDataset):
         RuntimeError: If `shard_id` is specified but `num_shards` is None.
         ValueError: If `shard_id` is not in range of [0, `num_shards` ).
 
+    Note:
+        - When sharding MindRecord (by configuring `num_shards` and `shard_id`), there are two strategies to implement
+          the data sharding logic. This API uses the strategy 2.
+
+          .. list-table:: Data sharding strategy 1
+             :widths: 50 50 50 50
+             :header-rows: 1
+
+             * - rank 0
+               - rank 1
+               - rank 2
+               - rank 3
+             * - 0
+               - 1
+               - 2
+               - 3
+             * - 4
+               - 5
+               - 6
+               - 7
+             * - 8
+               - 9
+               - 10
+               - 11
+
+          .. list-table:: Data sharding strategy 2
+             :widths: 50 50 50 50
+             :header-rows: 1
+
+             * - rank 0
+               - rank 1
+               - rank 2
+               - rank 3
+             * - 0
+               - 3
+               - 6
+               - 9
+             * - 1
+               - 4
+               - 7
+               - 10
+             * - 2
+               - 5
+               - 8
+               - 11
+
     Note:
         - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
           used in the dataset, and their effects when combined with parameter `sampler` are as follows.
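A short sketch of the two strategies as index math (12 samples, 4 shards; strategy 2, which MindDataset uses, hands each rank a contiguous block):

num_samples, num_shards = 12, 4

# Strategy 1: interleaved, sample i -> rank i % num_shards
strategy1 = {r: [i for i in range(num_samples) if i % num_shards == r]
             for r in range(num_shards)}
# {0: [0, 4, 8], 1: [1, 5, 9], 2: [2, 6, 10], 3: [3, 7, 11]}

# Strategy 2: contiguous blocks, sample i -> rank i // (num_samples // num_shards)
block = num_samples // num_shards
strategy2 = {r: list(range(r * block, (r + 1) * block)) for r in range(num_shards)}
# {0: [0, 1, 2], 1: [3, 4, 5], 2: [6, 7, 8], 3: [9, 10, 11]}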
@@ -307,7 +353,7 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
             When `compression_type` is not ``None``, and `num_samples` or numRows (parsed from `schema` ) is provided,
             `shard_equal_rows` will be implied as ``True``.
         cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
-            `Single-Node Data Cache <https://www.mindspore.cn/
+            `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
             Default: ``None`` , which means no cache is used.
         compression_type (str, optional): The type of compression used for all files, must be either ``''``,
             ``'GZIP'``, or ``'ZLIB'``. Default: ``None`` , as in empty string. It is highly recommended to
@@ -383,6 +429,10 @@ class OBSMindDataset(GeneratorDataset):
 
     The columns of generated dataset depend on the source MindRecord files.
 
+    Note:
+        - This interface accesses the `/cache` directory for node synchronization and requires the user to ensure
+          access to the `/cache` directory.
+
     Args:
         dataset_files (list[str]): List of files in cloud storage to be read and file path is in
             the format of s3://bucketName/objectKey.
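Given the new `/cache` note, a hedged sketch of a pre-flight check plus construction; every parameter besides `dataset_files` reflects our reading of the 2.4.0 signature and should be verified against the release docs:

import os
import mindspore.dataset as ds

# The note above implies /cache must be reachable for node synchronization.
assert os.access("/cache", os.W_OK), "OBSMindDataset needs a writable /cache"

data = ds.OBSMindDataset(
    dataset_files=["s3://my-bucket/train_part0.mindrecord"],  # illustrative object keys
    server="https://obs.cn-north-4.myhuaweicloud.com",        # assumed endpoint parameter
    ak="ACCESS_KEY", sk="SECRET_KEY",                         # assumed credential parameters
    sync_obs_path="s3://my-bucket/sync",                      # assumed sync-path parameter
)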