mindspore 2.3.0__cp310-cp310-win_amd64.whl → 2.4.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.
Files changed (275)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/common/__init__.py +6 -4
  13. mindspore/common/_pijit_context.py +190 -0
  14. mindspore/common/_register_for_tensor.py +2 -1
  15. mindspore/common/_tensor_overload.py +139 -0
  16. mindspore/common/api.py +102 -87
  17. mindspore/common/dump.py +5 -6
  18. mindspore/common/generator.py +1 -7
  19. mindspore/common/hook_handle.py +14 -26
  20. mindspore/common/initializer.py +51 -15
  21. mindspore/common/mindir_util.py +2 -2
  22. mindspore/common/parameter.py +62 -15
  23. mindspore/common/recompute.py +39 -9
  24. mindspore/common/sparse_tensor.py +7 -3
  25. mindspore/common/tensor.py +183 -37
  26. mindspore/communication/__init__.py +1 -1
  27. mindspore/communication/_comm_helper.py +38 -3
  28. mindspore/communication/comm_func.py +315 -60
  29. mindspore/communication/management.py +14 -14
  30. mindspore/context.py +132 -22
  31. mindspore/dataset/__init__.py +1 -1
  32. mindspore/dataset/audio/__init__.py +1 -1
  33. mindspore/dataset/core/config.py +7 -0
  34. mindspore/dataset/core/validator_helpers.py +7 -0
  35. mindspore/dataset/engine/cache_client.py +1 -1
  36. mindspore/dataset/engine/datasets.py +72 -44
  37. mindspore/dataset/engine/datasets_audio.py +7 -7
  38. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  39. mindspore/dataset/engine/datasets_text.py +20 -20
  40. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  41. mindspore/dataset/engine/datasets_vision.py +33 -33
  42. mindspore/dataset/engine/iterators.py +29 -0
  43. mindspore/dataset/engine/obs/util.py +7 -0
  44. mindspore/dataset/engine/queue.py +114 -60
  45. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  46. mindspore/dataset/engine/validators.py +34 -14
  47. mindspore/dataset/text/__init__.py +1 -4
  48. mindspore/dataset/transforms/__init__.py +0 -3
  49. mindspore/dataset/utils/line_reader.py +2 -0
  50. mindspore/dataset/vision/__init__.py +1 -4
  51. mindspore/dataset/vision/utils.py +1 -1
  52. mindspore/dataset/vision/validators.py +2 -1
  53. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  54. mindspore/experimental/es/embedding_service.py +883 -0
  55. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  56. mindspore/experimental/llm_boost/__init__.py +21 -0
  57. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  58. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  59. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  60. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  61. mindspore/experimental/llm_boost/register.py +129 -0
  62. mindspore/experimental/llm_boost/utils.py +31 -0
  63. mindspore/experimental/optim/adamw.py +85 -0
  64. mindspore/experimental/optim/optimizer.py +3 -0
  65. mindspore/hal/__init__.py +3 -3
  66. mindspore/hal/contiguous_tensors_handle.py +175 -0
  67. mindspore/hal/stream.py +18 -0
  68. mindspore/include/api/model_group.h +13 -1
  69. mindspore/include/api/types.h +10 -10
  70. mindspore/include/dataset/config.h +2 -2
  71. mindspore/include/dataset/constants.h +2 -2
  72. mindspore/include/dataset/execute.h +2 -2
  73. mindspore/include/dataset/vision.h +4 -0
  74. mindspore/log.py +1 -1
  75. mindspore/mindrecord/filewriter.py +68 -51
  76. mindspore/mindspore_backend.dll +0 -0
  77. mindspore/mindspore_common.dll +0 -0
  78. mindspore/mindspore_core.dll +0 -0
  79. mindspore/mindspore_np_dtype.dll +0 -0
  80. mindspore/mindspore_ops.dll +0 -0
  81. mindspore/mint/__init__.py +983 -46
  82. mindspore/mint/distributed/__init__.py +31 -0
  83. mindspore/mint/distributed/distributed.py +254 -0
  84. mindspore/mint/nn/__init__.py +268 -23
  85. mindspore/mint/nn/functional.py +125 -19
  86. mindspore/mint/nn/layer/__init__.py +39 -0
  87. mindspore/mint/nn/layer/activation.py +133 -0
  88. mindspore/mint/nn/layer/normalization.py +477 -0
  89. mindspore/mint/nn/layer/pooling.py +110 -0
  90. mindspore/mint/optim/adamw.py +26 -13
  91. mindspore/mint/special/__init__.py +63 -0
  92. mindspore/multiprocessing/__init__.py +2 -1
  93. mindspore/nn/__init__.py +0 -1
  94. mindspore/nn/cell.py +276 -96
  95. mindspore/nn/layer/activation.py +211 -44
  96. mindspore/nn/layer/basic.py +137 -10
  97. mindspore/nn/layer/embedding.py +137 -2
  98. mindspore/nn/layer/normalization.py +101 -5
  99. mindspore/nn/layer/padding.py +34 -48
  100. mindspore/nn/layer/pooling.py +161 -7
  101. mindspore/nn/layer/transformer.py +3 -3
  102. mindspore/nn/loss/__init__.py +2 -2
  103. mindspore/nn/loss/loss.py +84 -6
  104. mindspore/nn/optim/__init__.py +2 -1
  105. mindspore/nn/optim/adadelta.py +1 -1
  106. mindspore/nn/optim/adam.py +1 -1
  107. mindspore/nn/optim/lamb.py +1 -1
  108. mindspore/nn/optim/tft_wrapper.py +124 -0
  109. mindspore/nn/wrap/cell_wrapper.py +12 -23
  110. mindspore/nn/wrap/grad_reducer.py +5 -5
  111. mindspore/nn/wrap/loss_scale.py +17 -3
  112. mindspore/numpy/__init__.py +1 -1
  113. mindspore/numpy/array_creations.py +65 -68
  114. mindspore/numpy/array_ops.py +64 -60
  115. mindspore/numpy/fft.py +610 -75
  116. mindspore/numpy/logic_ops.py +11 -10
  117. mindspore/numpy/math_ops.py +85 -84
  118. mindspore/numpy/utils_const.py +4 -4
  119. mindspore/opencv_core452.dll +0 -0
  120. mindspore/opencv_imgcodecs452.dll +0 -0
  121. mindspore/opencv_imgproc452.dll +0 -0
  122. mindspore/ops/__init__.py +6 -4
  123. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
  124. mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
  125. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  126. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  127. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  128. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  129. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
  130. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  131. mindspore/ops/auto_generate/gen_extend_func.py +767 -13
  132. mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
  133. mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
  134. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  135. mindspore/ops/composite/base.py +85 -48
  136. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  137. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  138. mindspore/ops/function/__init__.py +22 -0
  139. mindspore/ops/function/array_func.py +492 -153
  140. mindspore/ops/function/debug_func.py +113 -1
  141. mindspore/ops/function/fft_func.py +15 -2
  142. mindspore/ops/function/grad/grad_func.py +3 -2
  143. mindspore/ops/function/math_func.py +564 -207
  144. mindspore/ops/function/nn_func.py +817 -383
  145. mindspore/ops/function/other_func.py +3 -2
  146. mindspore/ops/function/random_func.py +402 -12
  147. mindspore/ops/function/reshard_func.py +13 -11
  148. mindspore/ops/function/sparse_unary_func.py +1 -1
  149. mindspore/ops/function/vmap_func.py +3 -2
  150. mindspore/ops/functional.py +24 -14
  151. mindspore/ops/op_info_register.py +3 -3
  152. mindspore/ops/operations/__init__.py +7 -2
  153. mindspore/ops/operations/_grad_ops.py +2 -76
  154. mindspore/ops/operations/_infer_ops.py +1 -1
  155. mindspore/ops/operations/_inner_ops.py +71 -94
  156. mindspore/ops/operations/array_ops.py +14 -146
  157. mindspore/ops/operations/comm_ops.py +63 -53
  158. mindspore/ops/operations/custom_ops.py +83 -19
  159. mindspore/ops/operations/debug_ops.py +42 -10
  160. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  161. mindspore/ops/operations/manually_defined/ops_def.py +273 -20
  162. mindspore/ops/operations/math_ops.py +12 -223
  163. mindspore/ops/operations/nn_ops.py +20 -114
  164. mindspore/ops/operations/other_ops.py +7 -4
  165. mindspore/ops/operations/random_ops.py +46 -1
  166. mindspore/ops/primitive.py +18 -6
  167. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  168. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  169. mindspore/ops_generate/gen_constants.py +36 -0
  170. mindspore/ops_generate/gen_ops.py +67 -52
  171. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  172. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  173. mindspore/ops_generate/op_proto.py +10 -3
  174. mindspore/ops_generate/pyboost_utils.py +14 -1
  175. mindspore/ops_generate/template.py +43 -21
  176. mindspore/parallel/__init__.py +3 -1
  177. mindspore/parallel/_auto_parallel_context.py +31 -9
  178. mindspore/parallel/_cell_wrapper.py +85 -0
  179. mindspore/parallel/_parallel_serialization.py +47 -19
  180. mindspore/parallel/_tensor.py +127 -13
  181. mindspore/parallel/_utils.py +53 -22
  182. mindspore/parallel/algo_parameter_config.py +5 -5
  183. mindspore/parallel/checkpoint_transform.py +46 -39
  184. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  185. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  186. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  187. mindspore/parallel/parameter_broadcast.py +3 -4
  188. mindspore/parallel/shard.py +162 -31
  189. mindspore/parallel/transform_safetensors.py +1146 -0
  190. mindspore/profiler/__init__.py +2 -1
  191. mindspore/profiler/common/constant.py +29 -0
  192. mindspore/profiler/common/registry.py +47 -0
  193. mindspore/profiler/common/util.py +28 -0
  194. mindspore/profiler/dynamic_profiler.py +694 -0
  195. mindspore/profiler/envprofiling.py +17 -19
  196. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  197. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  198. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  199. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  200. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  201. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  202. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  203. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  204. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  205. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  206. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  207. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  208. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  209. mindspore/profiler/parser/framework_parser.py +1 -391
  210. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  211. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  212. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  213. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  214. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  215. mindspore/profiler/parser/profiler_info.py +78 -6
  216. mindspore/profiler/profiler.py +153 -0
  217. mindspore/profiler/profiling.py +285 -413
  218. mindspore/rewrite/__init__.py +1 -2
  219. mindspore/rewrite/common/namespace.py +4 -4
  220. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  221. mindspore/run_check/_check_version.py +39 -104
  222. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  223. mindspore/train/__init__.py +4 -3
  224. mindspore/train/_utils.py +105 -19
  225. mindspore/train/amp.py +171 -53
  226. mindspore/train/callback/__init__.py +2 -2
  227. mindspore/train/callback/_callback.py +4 -4
  228. mindspore/train/callback/_checkpoint.py +97 -31
  229. mindspore/train/callback/_cluster_monitor.py +1 -1
  230. mindspore/train/callback/_flops_collector.py +1 -0
  231. mindspore/train/callback/_loss_monitor.py +3 -3
  232. mindspore/train/callback/_on_request_exit.py +145 -31
  233. mindspore/train/callback/_summary_collector.py +5 -5
  234. mindspore/train/callback/_tft_register.py +375 -0
  235. mindspore/train/dataset_helper.py +15 -3
  236. mindspore/train/metrics/metric.py +3 -3
  237. mindspore/train/metrics/roc.py +4 -4
  238. mindspore/train/mind_ir_pb2.py +44 -39
  239. mindspore/train/model.py +154 -58
  240. mindspore/train/serialization.py +342 -128
  241. mindspore/utils/__init__.py +21 -0
  242. mindspore/utils/utils.py +60 -0
  243. mindspore/version.py +1 -1
  244. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
  245. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +248 -242
  246. mindspore/include/c_api/ms/abstract.h +0 -67
  247. mindspore/include/c_api/ms/attribute.h +0 -197
  248. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  249. mindspore/include/c_api/ms/base/macros.h +0 -32
  250. mindspore/include/c_api/ms/base/status.h +0 -33
  251. mindspore/include/c_api/ms/base/types.h +0 -283
  252. mindspore/include/c_api/ms/context.h +0 -102
  253. mindspore/include/c_api/ms/graph.h +0 -160
  254. mindspore/include/c_api/ms/node.h +0 -606
  255. mindspore/include/c_api/ms/tensor.h +0 -161
  256. mindspore/include/c_api/ms/value.h +0 -84
  257. mindspore/mindspore_shared_lib.dll +0 -0
  258. mindspore/nn/extend/basic.py +0 -140
  259. mindspore/nn/extend/embedding.py +0 -143
  260. mindspore/nn/extend/layer/normalization.py +0 -109
  261. mindspore/nn/extend/pooling.py +0 -117
  262. mindspore/nn/layer/embedding_service.py +0 -531
  263. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  264. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  265. mindspore/ops/extend/__init__.py +0 -53
  266. mindspore/ops/extend/array_func.py +0 -218
  267. mindspore/ops/extend/math_func.py +0 -76
  268. mindspore/ops/extend/nn_func.py +0 -308
  269. mindspore/ops/silent_check.py +0 -162
  270. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  271. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  272. mindspore/train/callback/_mindio_ttp.py +0 -443
  273. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +0 -0
  274. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
  275. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
- # Copyright 2022-2023 Huawei Technologies Co., Ltd
+ # Copyright 2022-2024 Huawei Technologies Co., Ltd
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -16,13 +16,13 @@
  1. This file is an abstraction of the dataset loading class. It contains
  some basic dataset operations(skip, filter, map, batch, ...).
  2. Specific dataset loading classes can be found in datasets_vision.py, datasets_text.py,
- datasets_audio.py, datasets_standard_format.py and dataets_user_defined.py files.
+ datasets_audio.py, datasets_standard_format.py and datasets_user_defined.py files.
  datasets_vision.py: contains vision dataset loading classes.
  datasets_text.py: contains text dataset loading classes.
  datasets_audio.py: contains audio dataset loading classes.
  datasets_standard_format.py: contains standard format loading classes which
  any other kinds of datasets can be converted to.
- dataets_user_defined.py: contains basic classes that help users to define
+ datasets_user_defined.py: contains basic classes that help users to define
  flexible ways to load dataset.
  """
  import atexit
@@ -66,13 +66,13 @@ from mindspore.dataset.debug import DebugHook

  from mindspore.dataset.engine import samplers
  from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
- ITERATORS_LIST, _unset_iterator_cleanup
+ ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
  from .queue import _SharedQueue, _Queue
  from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
  check_rename, check_device_send, check_take, check_output_shape, check_project, \
  check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
  check_save, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, check_padded_batch, \
- check_total_batch
+ check_total_batch, check_sync_update
  from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
  get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, _get_debug_hook_list
  from ..core.datatypes import mstype_to_detype
@@ -494,6 +494,12 @@ class Dataset:

  .. image:: bucket_batch_by_length_en.png

+ Note:
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.
+
  Args:
  column_names (list[str]): Columns passed to element_length_function.
  bucket_boundaries (list[int]): A list consisting of the upper boundaries
@@ -564,8 +570,12 @@
  .. image:: batch_en.png

  Note:
- The order of using repeat and batch reflects the number of batches and per_batch_map.
- It is recommended that the repeat operation applied after the batch operation finished.
+ - The order of using repeat and batch reflects the number of batches and per_batch_map.
+ It is recommended that the repeat operation applied after the batch operation finished.
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.

  Args:
  batch_size (Union[int, Callable]): The number of rows each batch is created with. An
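
The data-sinking note added above is easy to act on in user code. A minimal sketch (the toy NumpySlicesDataset and column name are illustrative, not from the diff) of keeping batch shapes static by dropping the last incomplete batch:

    import numpy as np
    import mindspore.dataset as ds

    # 10 samples with batch_size=3 would leave a ragged final batch of 1 sample;
    # drop_remainder=True discards it so every batch has the same static shape.
    data = ds.NumpySlicesDataset({"x": np.arange(10).astype(np.float32)}, shuffle=False)
    data = data.batch(batch_size=3, drop_remainder=True)
    for batch in data.create_dict_iterator(output_numpy=True):
        print(batch["x"].shape)  # always (3,)
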
@@ -598,10 +608,10 @@
  name as the input columns, i.e., the columns will be replaced.

  - python_multiprocessing (bool, optional): Parallelize Python function `per_batch_map` with
- multi-processing or multi-threading mode, ``True`` means multi-processing,
- ``False`` means multi-threading If `per_batch_map` is a I/O bound task, use
- multi-threading mode. If `per_batch_map` is a CPU bound task, it is recommended to use
- multi-processing mode. Default: ``False`` , use python multi-threading mode.
+ multiprocessing or multithreading mode, ``True`` means multiprocessing,
+ ``False`` means multithreading If `per_batch_map` is a I/O bound task, use
+ multithreading mode. If `per_batch_map` is a CPU bound task, it is recommended to use
+ multiprocessing mode. Default: ``False`` , use python multithreading mode.

  - max_rowsize(Union[int, list[int]], optional): Maximum size of row in MB that is used for shared memory
  allocation to copy data between processes, the total occupied shared memory will increase as
@@ -611,7 +621,7 @@
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  Returns:
  Dataset, a new dataset with the above operation applied.
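
Together these two doc changes describe how `per_batch_map` workers move data: multiprocessing copies rows through shared memory, and the new `max_rowsize` default of ``None`` sizes that memory dynamically instead of the old fixed 16 MB per row. A hedged sketch (the dataset and callback are illustrative):

    import numpy as np
    import mindspore.dataset as ds

    def scale_batch(col, batch_info):
        # per_batch_map receives each input column as a list of rows plus BatchInfo,
        # and returns the transformed columns as a tuple.
        return ([np.asarray(row) * 2 for row in col],)

    data = ds.NumpySlicesDataset({"x": np.arange(8).astype(np.float32)}, shuffle=False)
    data = data.batch(batch_size=4,
                      per_batch_map=scale_batch,
                      input_columns=["x"],
                      python_multiprocessing=True,  # CPU-bound per_batch_map
                      max_rowsize=None)             # 2.4.1 default: dynamic shared memory
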
@@ -657,8 +667,12 @@
  .. image:: padded_batch_en.png

  Note:
- The order of using repeat and padded_batch reflects the number of batches.
- It is recommended that the repeat operation applied after the padded_batch operation finished.
+ - The order of using repeat and padded_batch reflects the number of batches.
+ It is recommended that the repeat operation applied after the padded_batch operation finished.
+ - When using `Data Sinking <https://www.mindspore.cn/docs/en/master/model_train/train_process/optimize/
+ sink_mode.html#data-sinking>`_ in Graph mode, the input shape of the network should keep consistent.
+ You should set `drop_remainder` to "True" to discard the last incomplete batch of data,
+ or supplement/remove samples to ensure the dataset size is divisible by `batch_size`.

  Args:
  batch_size (Union[int, Callable]): The number of rows each batch is created with. An
@@ -905,7 +919,7 @@
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  - cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
  Default: ``None``, which means no cache is used.
@@ -989,8 +1003,8 @@
  num_parallel_workers = 1
  logger.warning(
  "Input 'operations' of 'map' includes network computing operators like in mindspore.nn, mindspore.ops, "
- "mindspore.numpy module and etc, which do not support multi-thread compiling, recommend to replace it "
- "with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")
+ "mindspore.numpy module and etc, which do not support multithreading compiling, recommend to replace "
+ "it with python implemented operator like numpy etc. Here decrease 'num_parallel_workers' into 1.")

  return MapDataset(self, operations, input_columns, output_columns, num_parallel_workers, **kwargs)

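
The reworded warning keeps the same advice: map transforms should be plain Python/NumPy so they can run on multiple workers. A small sketch of the preferred style (the normalize function is illustrative):

    import numpy as np
    import mindspore.dataset as ds

    # A plain numpy callable parallelizes cleanly across map workers.
    def normalize_np(x):
        return (x - x.mean()) / (x.std() + 1e-6)

    data = ds.NumpySlicesDataset({"x": np.random.rand(16, 4).astype(np.float32)}, shuffle=False)
    data = data.map(operations=normalize_np, input_columns=["x"], num_parallel_workers=4)

    # Passing mindspore.nn / mindspore.ops operators as `operations` would trigger the
    # warning above and force num_parallel_workers down to 1.
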
@@ -1523,8 +1537,8 @@
  2. Before calling the function, do not use batch operation, repeat operation or data augmentation operations
  with random attribute in map operation.
  3. When array dimension is variable, one-dimensional arrays or
- multi-dimensional arrays with variable dimension 0 are supported.
- 4. MindRecord does not support multi-dimensional string or multi-dimensional bytes.
+ multidimensional arrays with variable dimension 0 are supported.
+ 4. MindRecord does not support multidimensional string or multidimensional bytes.

  Args:
  file_name (str): Path to dataset file.
@@ -1741,6 +1755,7 @@
  return self._col_names

  @check_output_shape
+ @_cleanup_the_iterators_if_created
  def output_shapes(self, estimate=False):
  """
  Get the shapes of output data.
@@ -1792,6 +1807,7 @@
  self.saved_output_shapes = output_shapes
  return output_shapes

+ @_cleanup_the_iterators_if_created
  def output_types(self):
  """
  Get the types of output data.
@@ -1826,6 +1842,7 @@
  del self.runtime_context
  return self.saved_output_types

+ @_cleanup_the_iterators_if_created
  def get_dataset_size(self):
  """
  Return the number of batches in an epoch.
@@ -1893,6 +1910,7 @@
  return self.children[0].is_sync()
  return False

+ @check_sync_update
  def sync_update(self, condition_name, num_batch=None, data=None):
  """
  Release a blocking condition and trigger callback with given data.
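
For context on the now-validated `sync_update`: it pairs with `sync_wait` to gate the pipeline on the consumer. A rough sketch of that pairing under the signature shown above (the condition name and toy dataset are illustrative; see the MindSpore docs for the exact contract):

    import numpy as np
    import mindspore.dataset as ds

    data = ds.NumpySlicesDataset({"x": np.arange(8).astype(np.float32)}, shuffle=False)
    # Block the pipeline after each batch until the consumer releases it.
    data = data.sync_wait(condition_name="policy", num_batch=1)
    data = data.batch(batch_size=2)

    for batch in data.create_dict_iterator(output_numpy=True):
        # ... consume the batch, then release the blocking condition
        data.sync_update(condition_name="policy")
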
@@ -2174,7 +2192,7 @@ class TextBaseDataset(Dataset):
  Japanese or Chinese character sets, and 1.0 for other languages with small character sets
  like English or Latin.
  model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
- The input sentence must be pretokenized when using word type.
+ The input sentence must be pre-tokenized when using word type.
  params(dict): Any extra optional parameters of sentencepiece library according to your raw data

  Returns:
@@ -2251,7 +2269,7 @@ class TextBaseDataset(Dataset):
  Japanese or Chinese character sets, and 1.0 for other languages with small character sets
  like English or Latin.
  model_type(SentencePieceModel): Model type. Choose from unigram (default), bpe, char, or word.
- The input sentence must be pretokenized when using word type.
+ The input sentence must be pre-tokenized when using word type.
  params(dict): Any extra optional parameters of sentencepiece library according to your raw data

  Returns:
@@ -2629,12 +2647,12 @@ class BatchDataset(UnionBaseDataset):
  ``input_columns`` and ``output_columns`` use this value as the unit to create shared memory.
  If it is a list, the first element represents the ``input_columns`` use this value as the unit to
  create shared memory, and the second element represents ``output_columns`` use this value as the unit
- to create shared memory. Default: 16.
+ to create shared memory. Default: ``None`` , allocate shared memory dynamically.

  """

  def __init__(self, input_dataset, batch_size, drop_remainder=False, num_parallel_workers=None, per_batch_map=None,
- input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=16):
+ input_columns=None, output_columns=None, python_multiprocessing=False, max_rowsize=None):
  super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers)

  if BatchDataset._is_ancestor_of_repeat(input_dataset):
@@ -2655,7 +2673,9 @@

  self.python_multiprocessing = python_multiprocessing
  self.process_pool = None
- if isinstance(max_rowsize, int):
+ if max_rowsize is None:
+ self.max_rowsize = [-1, -1]
+ elif isinstance(max_rowsize, int):
  self.max_rowsize = [max_rowsize * self.batch_size] * 2 if max_rowsize != -1 else [max_rowsize, max_rowsize]
  else:
  self.max_rowsize = [max_rowsize[0] * self.batch_size, max_rowsize[1] * self.batch_size]
@@ -3078,7 +3098,7 @@ class Pipe:
  Class to handle communication between the master process and the worker processes.
  """

- def __init__(self, warning_ctl, shared_memory=False, max_rowsize=16):
+ def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
  self.shared_memory = shared_memory
  self.eof = multiprocessing.Event()
  if self.shared_memory:
@@ -3139,7 +3159,10 @@ def _worker_loop(operations, pipe, worker_id):
  """
  Multiprocess worker process loop.
  """
- # Ensure that the process does not hung when exiting
+ # Initialize C++ side signal handlers
+ cde.register_worker_handlers()
+
+ # Ensure that the process does not hang when exiting
  pipe.res_queue.cancel_join_thread()

  def _ignore_sigint():
@@ -3153,6 +3176,7 @@ def _worker_loop(operations, pipe, worker_id):
  # that the random results of each process are different.
  if get_seed() != 5489:
  set_seed(get_seed() + worker_id)
+
  while not _main_process_already_exit():
  _ignore_sigint()

@@ -3184,7 +3208,7 @@ class _MPWorker(multiprocessing.Process):
  Worker process for multiprocessing.
  """

- def __init__(self, operations, warning_ctl, max_rowsize=16, worker_id=0):
+ def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
  shared_memory = get_enable_shared_mem()
  self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
  self.check_interval = get_multiprocessing_timeout_interval()
@@ -3216,14 +3240,6 @@
  logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
  try:
  res = self.pipe.master_receive()
- # Because there is no need to copy when creating Tensors in the C++layer, it reduces the time
- # from np.ndarray to C++Tensor creation. However, when using shared memory in multiple processes,
- # the address of the shared memory will always be passed to subsequent nodes in the dataset pipeline,
- # and the shared memory will also be written by the current node, causing dirty data to be accessed
- # by subsequent nodes in the pipeline. So make a memory copy here to solve the problem of
- # shared memory being contaminated.
- if get_enable_shared_mem():
- res = copy.deepcopy(res)
  except queue.Empty:
  continue
  if res is None:
@@ -3286,7 +3302,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
  self.origin_hook(ex_type, value, tb)
  self.mp_pool_exit_preprocess()

- def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=16):
+ def __init__(self, op_name, num_parallel_workers, operations, max_rowsize=(-1, -1)):
  super(_PythonMultiprocessing, self).__init__()
  self.op_name = op_name
  self.num_parallel_workers = num_parallel_workers
@@ -3302,7 +3318,7 @@

  self.eot = None
  self.watch_dog = None
- self.ppid = os.getpid()
+ self.ppid = None
  self.hook = None
  self.warning_ctl = None
  # cache thread (get_ident()) to worker_id mapping in Python layer
@@ -3327,10 +3343,10 @@
  if child_pid == 0:
  break
  except OSError:
- # waitpid may be failed for some reasons so we ignore this error
+ # waitpid may fail for some reason, so we ignore this error
  pass

- # Dataset need watch_dog thread to monitoring fork multi-processing,
+ # Dataset need watch_dog thread to monitoring fork multiprocessing,
  # and thread can't be a member function otherwise python won't collect and release resources.
  @staticmethod
  def _watch_dog(eot, workers):
@@ -3363,6 +3379,8 @@
  "main process will exit. If this is not an artificial operation, you can use "
  "ds.config.set_enable_watchdog(False) to block this error.")
  os.kill(os.getpid(), signal.SIGTERM)
+ # sleep to release GIL
+ time.sleep(1)

  # release the workers
  del workers
@@ -3451,6 +3469,12 @@
  while _PythonMultiprocessing.is_process_alive(ppid):
  if quit_signal.is_set():
  return
+
+ # independent dataset mode, the subprocess of GeneratorDataset / map / batch should exit when
+ # independent dataset process have exit
+ if os.getppid() != ppid:
+ break
+
  time.sleep(0.1)

  _PythonMultiprocessing._terminate_processes(workers)
@@ -3462,10 +3486,10 @@
  Launch Python multiprocessing pool.

  Args:
- pop_id: ID for operation to have Python multiprocessing pool launched
+ op_id: ID for operation to have Python multiprocessing pool launched

  Returns:
- Python multiprocssing pool is launched.
+ Python multiprocessing pool is launched.
  """
  self.python_threads_to_workers = {}
  self.op_id = op_id
@@ -3476,6 +3500,7 @@
  logger.warning(message)
  self.terminate()
  self.reset()
+ self.ppid = os.getpid()
  self.create_pool()

  def create_pool(self):
@@ -3677,12 +3702,13 @@ class MapDataset(UnionBaseDataset):
  ``python_multiprocessing`` is set to True. If it is an int value, it represents ``input_columns`` and
  ``output_columns`` use this value as the unit to create shared memory. If it is a list, the first element
  represents the ``input_columns`` use this value as the unit to create shared memory, and the second element
- represents ``output_columns`` use this value as the unit to create shared memory. Default: 16.
+ represents ``output_columns`` use this value as the unit to create shared memory. Default: ``None`` ,
+ allocate shared memory dynamically.
  offload (bool, optional): Flag to indicate whether offload is used. Default: ``None``.
  """

  def __init__(self, input_dataset, operations=None, input_columns=None, output_columns=None,
- num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=16,
+ num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=None,
  offload=None):
  super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers, cache=cache)
  self.operations = to_list(operations)
@@ -3708,7 +3734,9 @@
  self.process_pool = None

  self.callbacks = to_list(callbacks)
- if isinstance(max_rowsize, int):
+ if max_rowsize is None:
+ self.max_rowsize = [-1, -1]
+ elif isinstance(max_rowsize, int):
  self.max_rowsize = [max_rowsize] * 2
  else:
  self.max_rowsize = max_rowsize
@@ -63,7 +63,7 @@ class CMUArcticDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``, will use ``0``. This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None``, which means no cache is used.

  Raises:
@@ -180,7 +180,7 @@ class GTZANDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -298,7 +298,7 @@ class LibriTTSDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -425,7 +425,7 @@ class LJSpeechDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -548,7 +548,7 @@ class SpeechCommandsDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
  This argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -661,7 +661,7 @@ class TedliumDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -841,7 +841,7 @@ class YesNoDataset(MappableDataset, AudioBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument can only
  be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -77,7 +77,7 @@ class CSVDataset(SourceDataset, UnionBaseDataset):
  shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``. This
  argument can only be specified when `num_shards` is also specified.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None``, which means no cache is used.

  Raises:
@@ -156,7 +156,7 @@ class MindDataset(MappableDataset, UnionBaseDataset):
  num_samples (int, optional): The number of samples to be included in the dataset.
  Default: ``None`` , all samples.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.

  Raises:
@@ -166,6 +166,52 @@ class MindDataset(MappableDataset, UnionBaseDataset):
  RuntimeError: If `shard_id` is specified but `num_shards` is None.
  ValueError: If `shard_id` is not in range of [0, `num_shards` ).

+ Note:
+ - When sharding MindRecord (by configuring `num_shards` and `shard_id`), there are two strategies to implement
+ the data sharding logic. This API uses the strategy 2.
+
+ .. list-table:: Data sharding strategy 1
+ :widths: 50 50 50 50
+ :header-rows: 1
+
+ * - rank 0
+ - rank 1
+ - rank 2
+ - rank 3
+ * - 0
+ - 1
+ - 2
+ - 3
+ * - 4
+ - 5
+ - 6
+ - 7
+ * - 8
+ - 9
+ - 10
+ - 11
+
+ .. list-table:: Data sharding strategy 2
+ :widths: 50 50 50 50
+ :header-rows: 1
+
+ * - rank 0
+ - rank 1
+ - rank 2
+ - rank 3
+ * - 0
+ - 3
+ - 6
+ - 9
+ * - 1
+ - 4
+ - 7
+ - 10
+ * - 2
+ - 5
+ - 8
+ - 11
+
  Note:
  - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
  used in the dataset, and their effects when combined with parameter `sampler` are as follows.
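
Per the new note, strategy 2 interleaves samples: shard k reads rows k, k + num_shards, k + 2*num_shards, and so on. A minimal sketch (the file name is illustrative):

    import mindspore.dataset as ds

    # With strategy 2, shard_id=1 of num_shards=4 sees samples 1, 5, 9, ...
    data = ds.MindDataset(dataset_files="data.mindrecord", num_shards=4, shard_id=1)
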
@@ -307,7 +353,7 @@ class TFRecordDataset(SourceDataset, UnionBaseDataset):
  When `compression_type` is not ``None``, and `num_samples` or numRows (parsed from `schema` ) is provided,
  `shard_equal_rows` will be implied as ``True``.
  cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
- `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
+ `Single-Node Data Cache <https://www.mindspore.cn/docs/en/master/model_train/dataset/cache.html>`_ .
  Default: ``None`` , which means no cache is used.
  compression_type (str, optional): The type of compression used for all files, must be either ``''``,
  ``'GZIP'``, or ``'ZLIB'``. Default: ``None`` , as in empty string. It is highly recommended to
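
All of the cache.html link updates above point at the same DatasetCache feature. A hedged sketch of wiring a cache into one of these loaders (session_id=1 and the file name are illustrative; a cache session must first be created out of band with the cache_admin tool):

    import mindspore.dataset as ds

    cache = ds.DatasetCache(session_id=1, size=0)  # size=0 means no memory cap
    data = ds.TFRecordDataset(dataset_files=["train.tfrecord"], cache=cache)
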
@@ -383,6 +429,10 @@ class OBSMindDataset(GeneratorDataset):

  The columns of generated dataset depend on the source MindRecord files.

+ Note:
+ - This interface accesses the `/cache` directory for node synchronization and requires the user to ensure
+ access to the `/cache` directory.
+
  Args:
  dataset_files (list[str]): List of files in cloud storage to be read and file path is in
  the format of s3://bucketName/objectKey.